利用Python进行数据分析的学习笔记——chap6

最新推荐文章于 2023-06-26 18:11:39 发布

调停者จุ๊บ

最新推荐文章于 2023-06-26 18:11:39 发布

阅读量319

点赞数

分类专栏：笔记文章标签： python 数据挖掘机器学习

本文链接：https://blog.csdn.net/hlinging/article/details/123171306

版权

笔记专栏收录该内容

15 篇文章 1 订阅

订阅专栏

读写文本格式的数据

在这里插入图片描述

import pandas as pd
import numpy as np
from pandas import Series,DataFrame

!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv"

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

df = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
#等价于
df = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv",sep=',')

#对于无标题行文件
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",header=None)

	0	1	2	3	4
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

#自定义列名
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",names=['a','b','c','d','message'])

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

names=['a','b','c','d','message']
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",names=names,index_col='message')

	a	b	c	d
message
hello	1	2	3	4
world	5	6	7	8
foo	9	10	11	12

#做一个层次化索引。传入由列编号或列名组成的列表
parsed = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\csv_mindex.csv",index_col=['key1','key2'])
parsed

		value1	value2
key1	key2
one	a	1	2
	b	3	4
	c	5	6
	d	7	8
two	a	9	10
	b	11	12
	c	13	14
	d	15	16

list(open("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt"))

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']

#文件各个字段由数量不定的空白符分隔
#可以用正则表达式\s+
result = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt",sep='\s+')
result

	A	B	C
aaa	-0.264438	-1.026059	-0.619500
bbb	0.927272	0.302904	-0.032399
ccc	-0.264273	-0.386314	-0.217601
ddd	-0.871858	-0.348382	1.100491

#读取文件的跳行操作
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex4.csv",skiprows=[0,2,3])

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

#含缺失值的数据
result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
result

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	two	5	6	NaN	8	world
2	three	9	10	11.0	12	foo

pd.isnull(result)

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

pandas会用一组常见的标记值来识别缺失值，如NA、-1.#IND、NULL以及空字符串等。

#na_values可以接受一组用于表示缺失值的字符串
result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv",na_values=['NULL'])
pd.isnull(result)

	something	a	b	c	d	message
0	False	False	False	False	False	True
1	False	False	False	True	False	False
2	False	False	False	False	False	False

#可以用一个字典为各列指定不同的NA标记值
sentinels = {'message':['foo','NA'],'something':['two']}
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv",na_values=sentinels)

	something	a	b	c	d	message
0	one	1	2	3.0	4	NaN
1	NaN	5	6	NaN	8	world
2	three	9	10	11.0	12	NaN

#有个知识点
import matplotlib.pyplot as plt
from pylab import *
img = plt.imread('read_csv或read_table函数的参数.png')
imshow(img)

<matplotlib.image.AxesImage at 0x1e8fd724130>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-etaY4igA-1645969826326)(output_17_1.png)]

img = plt.imread('read_csv或read_table函数的参数2.png')
imshow(img)

<matplotlib.image.AxesImage at 0x1e8fd712770>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WniQIXjq-1645969826327)(output_18_1.png)]

逐块读取文本文件

result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv")
#读取前几行
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv",nrows=5)

	one	two	three	four	key
0	0.467976	-0.038649	-0.295344	-1.824726	L
1	-0.358893	1.404453	0.704965	-0.200638	B
2	-0.501840	0.659254	-0.421691	-0.057688	G
3	0.204886	1.074134	1.388361	-0.982404	R
4	0.354628	-0.133116	0.283763	-0.837063	Q

#逐块读取文件
chunker = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv",chunksize=1000)
chunker

<pandas.io.parsers.readers.TextFileReader at 0x2eac66984f0>

# read_csv(..., chunksize=...) returns a TextFileReader that yields the file
# one DataFrame chunk at a time.
# Accumulate the value counts of the 'key' column across all chunks.
# The empty accumulator gets an explicit dtype because the default dtype of
# an empty Series depends on the pandas version.
tot = Series([], dtype='float64')
for piece in chunker:
    # Series.add aligns both operands on their index labels and adds them;
    # fill_value=0 substitutes 0 for a label missing on one side, so the
    # running total keeps counting instead of becoming NaN.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)

tot = tot.sort_values(ascending=False)

tot[:10]

E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

TextFileReader还有一个get_chunk方法，可以读取任意大小的块。

将数据写出到文本格式

data = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
#将数据写到一个以逗号分隔的文件中
data.to_csv("E:\python_study_files\ipython_data_analysis\out.csv")

import sys
data.to_csv(sys.stdout,sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

data.to_csv(sys.stdout,na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

data.to_csv(sys.stdout,index=False,header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

data.to_csv(sys.stdout,index=False,columns=['a','b','c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0

#Series也有一个to_csv方法
#生成时间2000/1/1-2000/1/7
dates = pd.date_range('1/1/2000',periods=7)
ts = Series(np.arange(7),index=dates)
ts.to_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv")
!type "E:\python_study_files\ipython_data_analysis\tseries.csv"

,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

#read_csv等价于
#Series.from_csv("E:\python_study_files\ipython_data_analysis\tseries.csv",parse_dates=True)
#from_csv无了，用回read_csv
pd.read_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv",parse_dates=True)

	Unnamed: 0	0
0	2000-01-01	0
1	2000-01-02	1
2	2000-01-03	2
3	2000-01-04	3
4	2000-01-05	4
5	2000-01-06	5
6	2000-01-07	6

手工处理分隔符格式

!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv"

"a","b","c"
"1","2","3"
"1","2","3"

import csv
f = open("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv")
reader = csv.reader(f)#reader = csv.reader(f,delimiter=',')
reader

<_csv.reader at 0x1c28677b160>

for line in reader:
    print(line)

['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']

# Read every row up front inside a with-block so the file handle is closed
# (the original left it open). The raw string keeps the Windows backslashes
# literal, and newline='' is what the csv module expects for its input files.
with open(r"E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv", newline='') as f:
    lines = list(csv.reader(f))
header, values = lines[0], lines[1:]
# zip(*values) transposes the data rows into columns; pairing them with the
# header row gives a {column name: tuple of column values} mapping.
data_dict = dict(zip(header, zip(*values)))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

zip(*values)

<zip at 0x1c28671d0c0>

在这里插入图片描述

# my_dialect has to exist before csv.writer can use it; without this
# definition the call fails with the NameError shown below.
class my_dialect(csv.Dialect):
    """Semicolon-delimited CSV dialect with minimal quoting."""
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    doublequote = True
    quoting = csv.QUOTE_MINIMAL

# newline='' leaves line endings entirely to the csv writer.
with open('mydata.csv', 'w', newline='') as f:
    writer = csv.writer(f, dialect=my_dialect)
    writer.writerow(('one', 'two', 'three'))
    writer.writerow(('1', '2', '3'))

---------------------------------------------------                  


NameError: name 'my_dialect' is not defined

JSON数据

obj =  """
{"name":"Wes",
"places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
           {"name":"Katie","age":33,"pet":"Cisco"}]
}
"""

import json
#将json对象转化为python对象
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

#将python对象转换为json对象
asjson = json.dumps(result)

siblings = DataFrame(result['siblings'],columns=['name','age'])
siblings

	name	age
0	Scott	25
1	Katie	33

XML和HTML：Web信息收集

from lxml.html import parse
import urllib.request
parsed = parse(urllib.request.urlopen('http://www.stats.gov.cn/tjsj/ndsj/2021/indexch.htm'))
doc = parsed.getroot()

#获取特定类型的所有HTML标签
links = doc.findall('.//a')
links

[]

#要得到URL和链接文本，可使用各对象的get方法（针对URL)和text_content方法（针对显示文本）
lnk = links[28]
lnk.get('href')
lnk.text_content()
#用列表推导式获取文档中的全部URL
urls = [lnk.get('href') for lnk in doc.findall('.//a')]

#TextParser类可用于自动类型转换
from pandas.io.parsers import TextParser

from lxml import objectify
path = 'Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
data = []
skip_fields = ['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']
#root.INDICATOR返回一个用于产生各个<INDICATOR>XML元素的生成器
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = DataFrame(data)
perf

	AGENCY_NAME	INDICATOR_NAME	DESCRIPTION	PERIOD_YEAR	PERIOD_MONTH	CATEGORY	FREQUENCY	INDICATOR_UNIT	YTD_TARGET	YTD_ACTUAL	MONTHLY_TARGET	MONTHLY_ACTUAL
0	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	1	Service Indicators	M	%	95.0	96.9	95.0	96.9
1	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	2	Service Indicators	M	%	95.0	96.0	95.0	95.0
2	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	3	Service Indicators	M	%	95.0	96.3	95.0	96.9
3	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	4	Service Indicators	M	%	95.0	96.8	95.0	98.3
4	Metro-North Railroad	On-Time Performance (West of Hudson)	Percent of commuter trains that arrive at thei...	2008	5	Service Indicators	M	%	95.0	96.6	95.0	95.8
...	...	...	...	...	...	...	...	...	...	...	...	...
643	Metro-North Railroad	Escalator Availability	Percent of the time that escalators are operat...	2011	8	Service Indicators	M	%	97.0		97.0
644	Metro-North Railroad	Escalator Availability	Percent of the time that escalators are operat...	2011	9	Service Indicators	M	%	97.0		97.0
645	Metro-North Railroad	Escalator Availability	Percent of the time that escalators are operat...	2011	10	Service Indicators	M	%	97.0		97.0
646	Metro-North Railroad	Escalator Availability	Percent of the time that escalators are operat...	2011	11	Service Indicators	M	%	97.0		97.0
647	Metro-North Railroad	Escalator Availability	Percent of the time that escalators are operat...	2011	12	Service Indicators	M	%	97.0		97.0

648 rows × 12 columns

二进制数据格式

pickle最好只用于短期存储，难以保证该格式永远稳定。

frame = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
frame

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

#frame.save('frame_pickle')
#只需要将save用to_pickle方法代替即可。同理，load要替换为read_pickle
frame.to_pickle('frame_pickle')#二进制格式化

#读取二进制数据
pd.read_pickle('frame_pickle')

	a	b	c	d	message
0	1	2	3	4	hello
1	5	6	7	8	world
2	9	10	11	12	foo

使用HDF5格式

如果需要处理海量数据，最好看看PyTables和h5py，因为很多数据分析问题都是IO密集型（而不是CPU密集型）。需要注意的是，HDF5最适合用作“一次写多次读”的数据集。

store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store

---------------------------------------------------------------------------


ImportError: Missing optional dependency 'tables'.  Use pip or conda to install tables.

读取Microsoft Excel文件

import openpyxl
xls_file = pd.ExcelFile('data.xlsx')
table = xls_file.parse('Sheet1')

使用HTML和Web API

import requests

url = 'http://www.json.org.cn/resource/json-in-javascript.htm'
resp = requests.get(url)
resp

<Response [200]>

import json
data = json.loads(resp.text)
data.keys()

---------------------------------------------------------------------------


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

使用数据库

import sqlite3

# Open a throwaway in-memory database and create the demo table in it.
con = sqlite3.connect(':memory:')
query = """
CREATE TABLE test
(a VARCHAR(20),b VARCHAR(20),
c REAL,        d INTEGER
);"""
con.execute(query)
con.commit()

# Insert a few rows through a parameterized statement.
stmt = "INSERT INTO test VALUES(?,?,?,?)"
data = [
    ('Atlanta', 'Georgia', 1.25, 6),
    ('Tallahassee', 'Florida', 2.6, 3),
    ('Sacramento', 'California', 1.7, 5),
]
con.executemany(stmt, data)
con.commit()

cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows

[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]

cursor.description

(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))

# In Python 3 zip() returns an iterator, so zip(*cursor.description)[0]
# raises the TypeError shown below. Each cursor.description entry is a
# 7-tuple whose first item is the column name, so read the names directly.
DataFrame(rows,columns=[col[0] for col in cursor.description])

---------------------------------------------------------------------------


TypeError: 'zip' object is not subscriptable

存取MongoDB中的数据

#import pymongo
#con = pymongo.Connection('localhost',port=27017)改了
from pymongo import MongoClient
con = MongoClient('localhost',port=27017)

调停者จุ๊บ

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
利用Python进行数据分析的学习笔记——chap6

读写文本格式的数据import pandas as pdimport numpy as npfrom pandas import Series,DataFrame!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv"a,b,c,d,message1,2,3,4,hello5,6,7,8,world9,10,11,12,foodf = pd.read_csv("E:\python_st
复制链接

扫一扫