所有程序都要处理输入和输出。本章涵盖处理不同类型文件的方法,包括文本文件和二进制文件、
文件编码以及其他相关内容,也会涉及对文件名和目录的操作。
读文本数据
#!/usr/bin/python3
# Demo: two ways of reading a text file -- all at once, or line by line.

# Read the entire file as a single string.
with open('somefile.txt', 'rt') as f:
    data = f.read()

print('*' * 50)

# Iterate over the lines of the file; a text file object yields one line
# per iteration.
with open('somefile.txt', 'rt') as f:
    print(type(f))
    for line in f:
        # Each line keeps its own trailing newline, so print() leaves a
        # blank line after it.
        print(line)
写文本数据
import re
def count_lines1(filename='somefile.txt'):
    """Return the number of lines in a text file.

    Args:
        filename: Path of the file to inspect. Defaults to 'somefile.txt',
            the file the other demos in this module write.

    Returns:
        The number of lines. A trailing newline terminates the last line
        instead of starting a new, empty one, and an empty file has zero
        lines.
    """
    with open(filename, 'rt') as f:
        # Iterating the file yields exactly one item per line, so there is
        # no phantom empty entry after the final newline (which is what
        # splitting the whole text on '\n' would produce).
        return sum(1 for _ in f)
def count_lines2(filename='somefile.txt'):
    """Return the number of lines in a text file, splitting with a regex.

    Args:
        filename: Path of the file to inspect. Defaults to 'somefile.txt',
            the file the other demos in this module write.

    Returns:
        The number of lines. A trailing newline terminates the last line
        instead of starting a new, empty one, and an empty file has zero
        lines.
    """
    with open(filename, 'rt') as f:
        data = f.read()
    lst = re.split(r'\n', data)
    # re.split always returns at least one piece, and the last piece is ''
    # when the text is empty or ends with a newline; that piece is not a line.
    if lst[-1] == '':
        lst.pop()
    return len(lst)
def wt1():
    """Write the module-level strings text0..text4 to 'somefile.txt'.

    The file is opened in 'wt' mode, so existing content is replaced. Each
    string is written on its own line.
    """
    with open('somefile.txt', 'wt') as out:
        # write() adds no line ending of its own, so append one per chunk.
        for chunk in (text0, text1, text2, text3, text4):
            out.write(chunk + '\n')
def wt2():
    """Write text0..text4 to 'somefile.txt' by redirecting print().

    The file is opened in 'wt' mode, so existing content is replaced. Each
    string ends up on its own line.
    """
    with open('somefile.txt', 'wt') as out:
        # One print call: sep puts a newline between the strings and the
        # default end supplies the final one.
        print(text0, text1, text2, text3, text4, sep='\n', file=out)
def read_various(filename):
    """Echo a text file of unknown encoding and copy it to 'new_test.txt'.

    Both files are opened as latin-1. Every byte value maps to a character
    in latin-1, so decoding never fails, and writing the text back out with
    the same codec restores the original bytes even when the printed text
    looks garbled.

    Args:
        filename: Path of the text file to read.
    """
    # Without an explicit encoding, open() uses the platform's locale
    # encoding (see locale.getpreferredencoding(False)).
    # Common encodings are ascii, latin-1, utf-8 and utf-16; web
    # applications usually use UTF-8.
    # Both files are managed by the same `with`, so the copy is flushed and
    # closed even if reading or printing fails part-way through.
    with open('new_test.txt', 'wt', encoding='latin-1') as new_f, \
            open(filename, 'rt', encoding='latin-1') as f:
        for i, line in enumerate(f, 1):
            # The echoed text may be mojibake, but what is written back is
            # still complete.
            print(i, line)
            new_f.write(line)
if __name__ == '__main__':
    # Sample text, bound as module globals so that wt1() and wt2() can see it.
    text0, text1, text2, text3, text4 = (
        '卞之琳',
        '你在窗口看风景',
        '我在桥上看你',
        '明月装饰了你的窗子',
        '你装饰了别人的梦',
    )
    # Other demos that can be run here instead: wt2(), print(count_lines2()).
    read_various('somefile.txt')
Windows 和 Unix 中文件的换行符不同
'''换行符在 Unix 和 Windows 中是不一样的 (分别
是 \n 和 \r\n )。默认情况下,Python 会以统一模式处理换行符。这种模式下,在读
取文本的时候,Python 可以识别所有的普通换行符并将其转换为单个 \n 字符。类似
的,在输出时会将换行符 \n 转换为系统默认的换行符。'''
# 如果你不希望这种默认的处理方式,可以给 open() 函数传入参数 newline='' ,就像下面这样
def read_no_newline(filename):
    """Print a text file's content with its line endings left untranslated.

    Args:
        filename: Path of the text file to read.
    """
    # newline='' switches off newline translation on read, so a '\r\n'
    # stored in the file shows up as '\r\n' in the text.
    with open(filename, mode='rt', newline='') as raw:
        content = raw.read()
    print(content)
# 读取一个win上的文件,content is "hello world!\r\n"
def read_win(filename):
    """Print a file twice to contrast newline translation on and off.

    The first pass uses open()'s default newline handling (translation
    enabled); the second passes newline='' to disable it.

    Args:
        filename: Path of the text file to read, e.g. one with '\\r\\n'
            line endings.
    """
    for newline_options in ({}, {'newline': ''}):
        with open(filename, 'rt', **newline_options) as stream:
            print(stream.read())
'''
通过 :set encoding 查看,该文件的编码为 encoding=euc-cn。
换行符的差异在交互式命令行中更容易观察:
>>> file = 'hello.txt'
>>> f = open(file, 'rt')
>>> f.read()
'hello world!\n'
>>> f.close()
>>> g = open(file, 'rt', newline='')
>>> g.read()
'hello world!\r\n'
>>>
'''
# 编码或解码错误
def produce_error(filename):
    """Print a file decoded strictly as ASCII, to demonstrate a decode error.

    Args:
        filename: Path of the text file to read.

    Raises:
        UnicodeDecodeError: If the file contains any non-ASCII byte.
    """
    # The `with` block closes the file even when read() raises, which is
    # the very case this demo is meant to trigger.
    with open(filename, 'rt', encoding='ascii') as f:
        print(f.read())
def handle_error(filename):
    """Print a file three times, each with a different way around bad bytes.

    The passes are separated by a row of 100 asterisks:
      1. ASCII with errors='replace' -- each undecodable byte becomes the
         Unicode replacement character U+FFFD.
      2. ASCII with errors='ignore' -- undecodable bytes are dropped.
      3. latin-1 -- every byte maps to a character, so nothing fails.

    Args:
        filename: Path of the text file to read.
    """
    # Each pass uses `with` so the handle is released even if a read fails.
    # Replace bad chars with the Unicode U+FFFD replacement char.
    with open(filename, 'rt', encoding='ascii', errors='replace') as f:
        print(f.read())
    print('*' * 100)
    # Ignore bad chars entirely.
    with open(filename, 'rt', encoding='ascii', errors='ignore') as g:
        print(g.read())
    print('*' * 100)
    with open(filename, 'rt', encoding='latin-1') as b:
        print(b.read())
if __name__ == '__main__':
    # The first rule of text processing is to make sure you always use the
    # correct encoding; when in doubt, stay with the default setting
    # (usually UTF-8).
    read_win('hello.txt')
    # Other demos that can be run here: produce_error('sample'),
    # handle_error('sample').