Study Notes on Python for Data Analysis, Chapter 6

Reading and Writing Data in Text Format


import pandas as pd
import numpy as np
from pandas import Series,DataFrame
!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv"
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
df = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
# Equivalent to:
df = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv",sep=',')
# For a file without a header row
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",header=None)
   0   1   2   3      4
0  1   2   3   4  hello
1  5   6   7   8  world
2  9  10  11  12    foo
# Specify column names yourself
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",names=['a','b','c','d','message'])
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
names=['a','b','c','d','message']
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex2.csv",names=names,index_col='message')
         a   b   c   d
message
hello    1   2   3   4
world    5   6   7   8
foo      9  10  11  12
# Form a hierarchical index by passing a list of column numbers or names
parsed = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\csv_mindex.csv",index_col=['key1','key2'])
parsed
           value1  value2
key1 key2
one  a          1       2
     b          3       4
     c          5       6
     d          7       8
two  a          9      10
     b         11      12
     c         13      14
     d         15      16
list(open("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt"))
['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491\n']
# The fields in this file are separated by a variable amount of whitespace,
# which can be expressed with the regular expression \s+
result = pd.read_table("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex3.txt",sep=r'\s+')
result
            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491
# Skipping rows while reading a file
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex4.csv",skiprows=[0,2,3])
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
# Data containing missing values
result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
result
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       two  5   6   NaN   8   world
2     three  9  10  11.0  12     foo
pd.isnull(result)
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False    False
2      False  False  False  False  False    False

pandas recognizes a set of commonly occurring sentinel values as missing data, such as NA, -1.#IND, NULL, and the empty string.

# na_values accepts a sequence of strings to treat as missing values
result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv",na_values=['NULL'])
pd.isnull(result)
   something      a      b      c      d  message
0      False  False  False  False  False     True
1      False  False  False   True  False     True
2      False  False  False  False  False    False
# A dict can be used to specify different NA sentinels for each column
sentinels = {'message':['foo','NA'],'something':['two']}
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv",na_values=sentinels)
  something  a   b     c   d message
0       one  1   2   3.0   4     NaN
1       NaN  5   6   NaN   8   world
2     three  9  10  11.0  12     NaN
# Reference: the book's tables of read_csv/read_table function parameters,
# saved locally as images and displayed inline
import matplotlib.pyplot as plt
img = plt.imread('read_csv或read_table函数的参数.png')
plt.imshow(img)
img = plt.imread('read_csv或read_table函数的参数2.png')
plt.imshow(img)

[Figures: reference tables of read_csv/read_table parameters]

Reading Text Files in Pieces

result = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv")
# Read only a small number of rows
pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv",nrows=5)
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
# Read the file in chunks
chunker = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv",chunksize=1000)
chunker
<pandas.io.parsers.readers.TextFileReader at 0x2eac66984f0>
# read_csv returns a TextFileReader object that iterates over the file chunk by chunk
# Aggregate the value counts of the 'key' column across all chunks
tot = Series([], dtype='float64')
for piece in chunker:
    # value_counts() counts the occurrences of each key within the chunk;
    # Series.add aligns on index and sums, treating absent entries as fill_value
    tot = tot.add(piece['key'].value_counts(),fill_value=0)

tot = tot.sort_values(ascending=False)
tot[:10]
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64

The get_chunk method reads a piece of an arbitrary size.
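A minimal sketch (assuming a fresh reader, since the one above was already exhausted by the loop):

chunker = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex6.csv",chunksize=1000)
piece = chunker.get_chunk(10)  # pull exactly 10 rows from the reader
len(piece)  # 10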

Writing Data Out to Text Format

data = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex5.csv")
# Write the data to a comma-separated file
data.to_csv("E:\python_study_files\ipython_data_analysis\out.csv")
import sys
data.to_csv(sys.stdout,sep='|')
|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo
data.to_csv(sys.stdout,na_rep='NULL')
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
data.to_csv(sys.stdout,index=False,header=False)
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo
data.to_csv(sys.stdout,index=False,columns=['a','b','c'])
a,b,c
1,2,3.0
5,6,
9,10,11.0
# Series also has a to_csv method
# Generate dates from 2000-01-01 through 2000-01-07
dates = pd.date_range('1/1/2000',periods=7)
ts = Series(np.arange(7),index=dates)
ts.to_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv")
!type "E:\python_study_files\ipython_data_analysis\tseries.csv"
,0
2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6
dates
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')
# Reading it back used to be done with
#   Series.from_csv("E:\python_study_files\ipython_data_analysis\tseries.csv",parse_dates=True)
# but from_csv has been removed, so use read_csv instead
pd.read_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv",parse_dates=True)
  Unnamed: 0  0
0 2000-01-01  0
1 2000-01-02  1
2 2000-01-03  2
3 2000-01-04  3
4 2000-01-05  4
5 2000-01-06  5
6 2000-01-07  6
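Note that this gives back a DataFrame rather than a Series. A closer stand-in for the removed Series.from_csv is a sketch like the following (squeeze collapses a one-column DataFrame into a Series):

pd.read_csv("E:\\python_study_files\\ipython_data_analysis\\tseries.csv",
            index_col=0,parse_dates=True).squeeze("columns")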

Handling Delimited Formats Manually

!type "E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv"
"a","b","c"
"1","2","3"
"1","2","3"
import csv
f = open("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv")
reader = csv.reader(f)#reader = csv.reader(f,delimiter=',')
reader
<_csv.reader at 0x1c28677b160>
for line in reader:
    print(line)
['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3']
lines = list(csv.reader(open("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex7.csv")))
header,values = lines[0],lines[1:]
data_dict = {h: v for h, v in zip(header,zip(*values))}
data_dict
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}
zip(*values)
<zip at 0x1c28671d0c0>
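zip returns a lazy iterator in Python 3; materializing it shows the rows transposed into per-column tuples:

list(zip(*values))
[('1', '1'), ('2', '2'), ('3', '3')]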


# Define the CSV dialect first; the original cell skipped this step,
# which is why it raised "NameError: name 'my_dialect' is not defined"
class my_dialect(csv.Dialect):
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL

with open('mydata.csv','w') as f:
    writer = csv.writer(f,dialect=my_dialect)
    writer.writerow(('one','two','three'))
    writer.writerow(('1','2','3'))

JSON Data

obj =  """
{"name":"Wes",
"places_lived":["United States","Spain","Germany"],
"pet":null,
"siblings":[{"name":"Scott","age":25,"pet":"Zuko"},
           {"name":"Katie","age":33,"pet":"Cisco"}]
}
"""
import json
# Convert a JSON string to a Python object
result = json.loads(obj)
result
{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}
# Convert a Python object back to a JSON string
asjson = json.dumps(result)
siblings = DataFrame(result['siblings'],columns=['name','age'])
siblings
    name  age
0  Scott   25
1  Katie   33
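pandas can also turn JSON straight into a DataFrame; a minimal sketch with a records-oriented string (the data here is hypothetical):

from io import StringIO
pd.read_json(StringIO('[{"a":1,"b":2},{"a":3,"b":4}]'))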

XML and HTML: Web Scraping

from lxml.html import parse
import urllib.request
parsed = parse(urllib.request.urlopen('http://www.stats.gov.cn/tjsj/ndsj/2021/indexch.htm'))
doc = parsed.getroot()
# findall retrieves all HTML tags of the given type (here, every <a> link)
links = doc.findall('.//a')
links
[]
# To get the URL and link text, use each element's get method (for the URL)
# and text_content method (for the display text). Note that links came back
# empty for this page, so the indexing below would raise an IndexError here;
# the pattern works on pages that actually contain <a> tags.
lnk = links[28]
lnk.get('href')
lnk.text_content()
# A list comprehension collects every URL in the document
urls = [lnk.get('href') for lnk in doc.findall('.//a')]
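For HTML tables specifically, pandas.read_html is usually the shorter route; a sketch (the URL is hypothetical, and lxml or html5lib must be installed):

tables = pd.read_html('http://example.com/page_with_tables.html')
df = tables[0]  # read_html returns a list of DataFrames, one per <table>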
# The TextParser class can be used for automatic type conversion
from pandas.io.parsers import TextParser
from lxml import objectify
path = 'Performance_MNR.xml'
parsed = objectify.parse(open(path))
root = parsed.getroot()
data = []
skip_fields = ['PARENT_SEQ','INDICATOR_SEQ','DESIRED_CHANGE','DECIMAL_PLACES']
# root.INDICATOR returns a generator that yields each <INDICATOR> XML element
for elt in root.INDICATOR:
    el_data = {}
    for child in elt.getchildren():
        if child.tag in skip_fields:
            continue
        el_data[child.tag] = child.pyval
    data.append(el_data)
perf = DataFrame(data)
perf
              AGENCY_NAME                         INDICATOR_NAME  ... MONTHLY_TARGET MONTHLY_ACTUAL
0    Metro-North Railroad  On-Time Performance (West of Hudson)  ...           95.0           96.9
1    Metro-North Railroad  On-Time Performance (West of Hudson)  ...           95.0           95.0
2    Metro-North Railroad  On-Time Performance (West of Hudson)  ...           95.0           96.9
3    Metro-North Railroad  On-Time Performance (West of Hudson)  ...           95.0           98.3
4    Metro-North Railroad  On-Time Performance (West of Hudson)  ...           95.0           95.8
..                    ...                                    ...  ...            ...            ...
643  Metro-North Railroad                 Escalator Availability  ...           97.0
644  Metro-North Railroad                 Escalator Availability  ...           97.0
645  Metro-North Railroad                 Escalator Availability  ...           97.0
646  Metro-North Railroad                 Escalator Availability  ...           97.0
647  Metro-North Railroad                 Escalator Availability  ...           97.0

(columns truncated for display; the full set is AGENCY_NAME, INDICATOR_NAME, DESCRIPTION, PERIOD_YEAR, PERIOD_MONTH, CATEGORY, FREQUENCY, INDICATOR_UNIT, YTD_TARGET, YTD_ACTUAL, MONTHLY_TARGET, MONTHLY_ACTUAL)

648 rows × 12 columns
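Recent pandas (1.3+) can do most of this in a single call; a sketch, assuming the same Performance_MNR.xml file (types are converted automatically, though the skip_fields columns would still need to be dropped by hand):

perf2 = pd.read_xml(path)  # one row per repeating <INDICATOR> element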

Binary Data Formats

pickle is best used only for short-term storage; it is hard to guarantee that the format will stay stable over time.

frame = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\ex1.csv")
frame
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo
# frame.save('frame_pickle') no longer exists; replace save with to_pickle,
# and likewise replace load with read_pickle
frame.to_pickle('frame_pickle')  # serialize to a binary format on disk
# Read the pickled binary data back
pd.read_pickle('frame_pickle')
   a   b   c   d message
0  1   2   3   4   hello
1  5   6   7   8   world
2  9  10  11  12     foo

Using HDF5 Format

If you need to work with huge quantities of data, look at PyTables and h5py, since many data analysis problems are I/O-bound rather than CPU-bound. Note that HDF5 is best suited to write-once, read-many datasets.

# HDFStore depends on the optional PyTables package; without it the code below
# raises "ImportError: Missing optional dependency 'tables'."
# Install it first with: pip install tables
store = pd.HDFStore('mydata.h5')
store['obj1'] = frame
store['obj1_col'] = frame['a']
store
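Objects in the store are retrieved dict-style; a minimal sketch, assuming the store above was created successfully:

frame2 = store['obj1']  # returns the stored DataFrame
store.close()           # flush and release the HDF5 file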

Reading Microsoft Excel Files

import openpyxl  # the engine pandas uses to read .xlsx files
xls_file = pd.ExcelFile('data.xlsx')
table = xls_file.parse('Sheet1')  # parse the named sheet into a DataFrame
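Writing goes the other way with to_excel; a minimal sketch (the output filename is hypothetical):

table.to_excel('data_out.xlsx',sheet_name='Sheet1',index=False)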

Interacting with HTML and Web APIs

import requests
url = 'http://www.json.org.cn/resource/json-in-javascript.htm'
resp = requests.get(url)
resp
<Response [200]>
import json
# This URL returns an HTML page, not JSON, so the next line fails with
# "JSONDecodeError: Expecting value: line 1 column 1 (char 0)";
# json.loads only works when the response body is actual JSON
data = json.loads(resp.text)
data.keys()
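A sketch against an endpoint that actually serves JSON (the GitHub issues API used in the book's second edition; assumes network access):

resp = requests.get('https://api.github.com/repos/pandas-dev/pandas/issues')
data = resp.json()  # the response body is a JSON list of dicts, one per issue
issues = DataFrame(data,columns=['number','title','labels','state'])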

Working with Databases

import sqlite3
query = """
CREATE TABLE test
(a VARCHAR(20),b VARCHAR(20),
c REAL,        d INTEGER
);"""
con = sqlite3.connect(':memory:')
con.execute(query)
con.commit()
# Insert a few rows of data
data = [('Atlanta','Georgia',1.25,6),
       ('Tallahassee','Florida',2.6,3),
       ('Sacramento','California',1.7,5)]
stmt = "INSERT INTO test VALUES(?,?,?,?)"

con.executemany(stmt,data)
con.commit()
cursor = con.execute('select * from test')
rows = cursor.fetchall()
rows
[('Atlanta', 'Georgia', 1.25, 6),
 ('Tallahassee', 'Florida', 2.6, 3),
 ('Sacramento', 'California', 1.7, 5)]
cursor.description
(('a', None, None, None, None, None, None),
 ('b', None, None, None, None, None, None),
 ('c', None, None, None, None, None, None),
 ('d', None, None, None, None, None, None))
# In Python 3, zip returns a lazy iterator that cannot be indexed, so the
# original zip(*cursor.description)[0] raised
# "TypeError: 'zip' object is not subscriptable"; pull the column names
# out of cursor.description with a list comprehension instead
DataFrame(rows,columns=[desc[0] for desc in cursor.description])
             a           b     c  d
0      Atlanta     Georgia  1.25  6
1  Tallahassee     Florida  2.60  3
2   Sacramento  California  1.70  5
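pandas can collapse the query-fetch-construct steps into a single call; a minimal sketch against the same connection:

import pandas.io.sql as sql
sql.read_sql('select * from test',con)  # returns the result set as a DataFrame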

Storing and Loading Data in MongoDB

# pymongo's old Connection class was removed:
#   con = pymongo.Connection('localhost',port=27017)  # no longer works
# Use MongoClient instead
from pymongo import MongoClient
con = MongoClient('localhost',port=27017)
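A minimal round-trip sketch, assuming a MongoDB server is running locally (the database and collection names here are hypothetical):

db = con.test_db  # databases and collections are created lazily on first use
coll = db.test_collection
coll.insert_one({'name':'Wes','pet':None})  # store one document
docs = DataFrame(list(coll.find({'name':'Wes'})))  # query back into a DataFrame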