#! /usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on 2014-2-12
@author: Java
'''
import re
import fileinput
import time
import os
class ReadFile(object):
    '''
    Read text files into a single string and collect file paths.

    Every read method strips leading/trailing whitespace from each line and
    joins the stripped lines with no separator, so line breaks are not
    preserved in the returned string. Each read method also records the path
    it was asked to read in ``self.filePath``.
    '''

    def __init__(self, filePath=None):
        '''
        Remember the path this reader was created for.

        :param filePath: path to store on the instance; optional so that the
            existing ``ReadFile()`` calls keep working.
        '''
        self.filePath = filePath

    def readFileByLine(self, filePath):
        '''
        Read a file one line at a time with ``readline()``.

        Stripped lines are collected in a list and joined once at the end
        (the Python counterpart of a Java StringBuffer). Original author's
        note: slower than the other approaches, but light on memory.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string.
        '''
        self.filePath = filePath
        lineList = []
        # "with" guarantees the handle is closed even if reading fails.
        with open(filePath) as handle:
            while True:
                line = handle.readline()
                if not line:
                    break
                lineList.append(line.strip())
        return ''.join(lineList)

    def readFileByFileInput(self, filePath):
        '''
        Read a file through the ``fileinput`` module.

        Original author's note: simpler to write, but measured at roughly
        13000 lines per second, the slowest of the three approaches.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string.
        '''
        self.filePath = filePath
        lineList = []
        stream = fileinput.input(filePath)
        try:
            for line in stream:
                lineList.append(line.strip())
        finally:
            # fileinput keeps module-level state; release it so a failure here
            # does not make the next fileinput.input() call raise.
            fileinput.close()
        return ''.join(lineList)

    def readFileByBuffer(self, filePath):
        '''
        Read a file in chunks of lines with ``readlines(100000)``.

        Original author's note: measured at roughly 96900 lines per second,
        the fastest of the three approaches.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string; an empty
            string for an empty file.
        '''
        self.filePath = filePath
        # Created before the loop so lines from every chunk accumulate and an
        # empty file still yields ''.
        lineList = []
        with open(filePath) as handle:
            while True:
                lines = handle.readlines(100000)
                if not lines:
                    break
                lineList.extend(line.strip() for line in lines)
        return ''.join(lineList)

    def getContent(self, listPath=None):
        '''
        Read every file in ``listPath`` with ``readFileByBuffer``.

        :param listPath: iterable of file paths; ``None`` is treated as empty.
        :return: list of ``(path, content)`` tuples in input order.
        '''
        if listPath is None:
            listPath = []
        return [(path, self.readFileByBuffer(path)) for path in listPath]

    def getFilePathList(self, fileListPath, startPath):
        '''
        Recursively collect the paths of all non-directory entries.

        :param fileListPath: list that found paths are appended to (mutated
            in place).
        :param startPath: directory to walk; if it is not a directory, it is
            appended as-is without checking that it exists.
        :return: the same ``fileListPath`` list.
        '''
        if os.path.isdir(startPath):
            # Directory: descend into every entry it contains.
            for sub in os.listdir(startPath):
                self.getFilePathList(fileListPath, os.path.join(startPath, sub))
        else:
            # Anything that is not a directory is recorded as a file path.
            fileListPath.append(startPath)
        return fileListPath

    def readFileToDB(self, conn, tableName, filePath):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass

    def readFileToLocalSave(self, savePath, filePath):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass
if __name__ == '__main__':
    # Demo entry point: walk a hard-coded Windows directory, read every file
    # found under it, and print the resulting list of (path, content) tuples.
    #
    # Notes kept from earlier ad-hoc experiments on a single small HTML file
    # (F:\pythonTest\readFile\1.html), timed with time.clock():
    #   readFileByBuffer    ~0.000361693996309
    #   readFileByFileInput ~0.000520244789212
    #   readFileByLine      ~0.000401662008687
    # The same experiments also stripped HTML tags from the content with
    # re.sub(re.compile(r'<[^>]+>', re.S), '', line).
    readFile = ReadFile()
    fileListPath = []
    fileListPath = readFile.getFilePathList(fileListPath, 'F:\\pythonTest\\readFile\\')
    Content = readFile.getContent(fileListPath)
    # Parenthesised form prints the same text on Python 2 and also parses on
    # Python 3.
    print(Content)
# -*- coding:utf-8 -*-
'''
Created on 2014-2-12
@author: Java
'''
import re
import fileinput
import time
import os
class ReadFile(object):
    '''
    Read text files into a single string and collect file paths.

    Every read method strips leading/trailing whitespace from each line and
    joins the stripped lines with no separator, so line breaks are not
    preserved in the returned string. Each read method also records the path
    it was asked to read in ``self.filePath``.
    '''

    def __init__(self, filePath=None):
        '''
        Remember the path this reader was created for.

        :param filePath: path to store on the instance; optional so that the
            existing ``ReadFile()`` calls keep working.
        '''
        self.filePath = filePath

    def readFileByLine(self, filePath):
        '''
        Read a file one line at a time with ``readline()``.

        Stripped lines are collected in a list and joined once at the end
        (the Python counterpart of a Java StringBuffer). Original author's
        note: slower than the other approaches, but light on memory.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string.
        '''
        self.filePath = filePath
        lineList = []
        # "with" guarantees the handle is closed even if reading fails.
        with open(filePath) as handle:
            while True:
                line = handle.readline()
                if not line:
                    break
                lineList.append(line.strip())
        return ''.join(lineList)

    def readFileByFileInput(self, filePath):
        '''
        Read a file through the ``fileinput`` module.

        Original author's note: simpler to write, but measured at roughly
        13000 lines per second, the slowest of the three approaches.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string.
        '''
        self.filePath = filePath
        lineList = []
        stream = fileinput.input(filePath)
        try:
            for line in stream:
                lineList.append(line.strip())
        finally:
            # fileinput keeps module-level state; release it so a failure here
            # does not make the next fileinput.input() call raise.
            fileinput.close()
        return ''.join(lineList)

    def readFileByBuffer(self, filePath):
        '''
        Read a file in chunks of lines with ``readlines(100000)``.

        Original author's note: measured at roughly 96900 lines per second,
        the fastest of the three approaches.

        :param filePath: path of the file to read.
        :return: all stripped lines concatenated into one string; an empty
            string for an empty file.
        '''
        self.filePath = filePath
        # Created before the loop so lines from every chunk accumulate and an
        # empty file still yields ''.
        lineList = []
        with open(filePath) as handle:
            while True:
                lines = handle.readlines(100000)
                if not lines:
                    break
                lineList.extend(line.strip() for line in lines)
        return ''.join(lineList)

    def getContent(self, listPath=None):
        '''
        Read every file in ``listPath`` with ``readFileByBuffer``.

        :param listPath: iterable of file paths; ``None`` is treated as empty.
        :return: list of ``(path, content)`` tuples in input order.
        '''
        if listPath is None:
            listPath = []
        return [(path, self.readFileByBuffer(path)) for path in listPath]

    def getFilePathList(self, fileListPath, startPath):
        '''
        Recursively collect the paths of all non-directory entries.

        :param fileListPath: list that found paths are appended to (mutated
            in place).
        :param startPath: directory to walk; if it is not a directory, it is
            appended as-is without checking that it exists.
        :return: the same ``fileListPath`` list.
        '''
        if os.path.isdir(startPath):
            # Directory: descend into every entry it contains.
            for sub in os.listdir(startPath):
                self.getFilePathList(fileListPath, os.path.join(startPath, sub))
        else:
            # Anything that is not a directory is recorded as a file path.
            fileListPath.append(startPath)
        return fileListPath

    def readFileToDB(self, conn, tableName, filePath):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass

    def readFileToLocalSave(self, savePath, filePath):
        '''Placeholder: not implemented, does nothing and returns None.'''
        pass
if __name__ == '__main__':
    # Demo entry point: walk a hard-coded Windows directory, read every file
    # found under it, and print the resulting list of (path, content) tuples.
    #
    # Notes kept from earlier ad-hoc experiments on a single small HTML file
    # (F:\pythonTest\readFile\1.html), timed with time.clock():
    #   readFileByBuffer    ~0.000361693996309
    #   readFileByFileInput ~0.000520244789212
    #   readFileByLine      ~0.000401662008687
    # The same experiments also stripped HTML tags from the content with
    # re.sub(re.compile(r'<[^>]+>', re.S), '', line).
    readFile = ReadFile()
    fileListPath = []
    fileListPath = readFile.getFilePathList(fileListPath, 'F:\\pythonTest\\readFile\\')
    Content = readFile.getContent(fileListPath)
    # Parenthesised form prints the same text on Python 2 and also parses on
    # Python 3.
    print(Content)