#!/usr/bin/python
# -*- coding: utf-8 -*-
#coding=utf-8
import sys,os,re
#t=sys.argv[1]
#f=open(t,"r")
#按行序选取特定行
#输入参数 文件标识: f
#输入参数 起始行数: start_line
#输入参数 终止行数: dead_line=0
#输入参数 截取周期: zhouqi
#输入参数 截取周期内行的行标(列表形式输入): list
#输出结果 需要截取的行存入一个列表中,输出此列表: line
def match_by_line(f, zhouqi=1, list=(1,), start_line=1, dead_line=0):
    """Pick lines out of every fixed-size group of consecutive lines.

    Lines are numbered from 1.  Starting at ``start_line`` the input is cut
    into consecutive groups of ``zhouqi`` lines, and from every complete
    group the lines at the 1-based positions given in ``list`` are kept, in
    the order listed.  A trailing incomplete group is dropped.

    Args:
        f: An open text file (any iterable of lines that also has a
            ``close()`` method).  It is always closed before returning.
        zhouqi: Group size ("period") in lines.  Values below 1 select
            nothing.
        list: 1-based positions, within each group, of the lines to keep.
            (The name shadows the builtin; it is kept so that keyword
            callers keep working.)
        start_line: Number of the first line that belongs to a group.
        dead_line: Number of the last line to consider; 0 means "until the
            end of the input".

    Returns:
        The selected lines, unmodified (line endings are kept).  The result
        is empty when ``dead_line`` is set and the range
        ``start_line..dead_line`` is not a whole number of groups.

    Raises:
        IndexError: If a position in ``list`` is larger than ``zhouqi``.
    """
    picked = []
    try:
        if zhouqi < 1:
            return picked
        # A bounded range must consist of complete groups; otherwise nothing
        # is selected.  Integer modulo avoids relying on float division.
        if dead_line != 0 and (dead_line - start_line + 1) % zhouqi != 0:
            return picked

        group = []
        for line_no, text in enumerate(f, 1):
            if dead_line != 0 and line_no > dead_line:
                break
            if line_no < start_line:
                continue
            group.append(text)
            if len(group) == zhouqi:
                picked.extend(group[pos - 1] for pos in list)
                group = []
    finally:
        f.close()
    return picked
'''
f=open("test.txt2","r")
zhouqi=2
list=[1]
start_line=3
dead_line=10
line=match_by_line(f,zhouqi,list,start_line)
for i in line:
print(i,end="")
'''
'''
########## 按字符串匹配结果选取特定行 ##########
输入参数: 要匹配的字符串 str_match
输入参数: 文件标识
输出结果: 需要截取的行存入一个列表中,输出此列表: line
'''
def match_by_str(f, str_match):
    """Return the lines of ``f`` in which the regular expression matches.

    Args:
        f: Iterable of text lines, e.g. an open text file.  It is not
            closed by this function.
        str_match: Regular expression (pattern string) searched for
            anywhere in each line with ``re.search`` semantics.

    Returns:
        The matching lines, unmodified and in input order.  A line is
        listed once even when the pattern matches it several times.
    """
    pattern = re.compile(str_match)
    return [text for text in f if pattern.search(text)]
'''
f=open("test.txt2","r",encoding='UTF-8')
zhouqi=2
list=[1]
start_line=3
dead_line=10
str_match="201\d\d\d\d\d"
line=match_by_str(f,str_match)
for i in line:
print(i,end="")#文件处理中未去掉换行符则需要如此输出
'''
######## 选取想要的列 ########
#注:此函数适用于linux操作系统,需要命令awk支持
#输入参数: 文件标识 f
#输入参数: 需要获取的列的id的列表 list
#输出结果: 需要截取的列存入一个列表中,输出此列表: line
def get_colunm_awk(f, list):
    """Extract whitespace-separated columns from a file using ``awk``.

    The selected columns are written tab-separated, one input record per
    line, to the scratch file ``lingshi.f`` in the current directory, which
    is then read back.

    Only the requested columns are printed.  ``awk`` is started directly,
    without a shell, so ``f`` is passed as one literal argument regardless
    of spaces or shell metacharacters in the name.

    Args:
        f: Path of the input file.
        list: 1-based awk field numbers to output, in the desired order.
            With an empty list awk prints each record unchanged.

    Returns:
        The lines of ``lingshi.f`` (each ending in a newline).  The exit
        status of awk is not checked, so a failing awk run yields whatever
        it managed to write, typically an empty list.

    Raises:
        OSError: If the ``awk`` executable cannot be started.
    """
    import subprocess  # local import: the file header only imports sys, os, re

    # Build e.g. {print $1"\t"$3}; awk itself turns "\t" into a tab.
    fields = '"\\t"'.join("$%d" % column for column in list)
    program = "{print " + fields + "}"
    with open("lingshi.f", "w") as scratch:
        subprocess.run(["awk", program, f], stdout=scratch)
    with open("lingshi.f", "r") as scratch:
        return scratch.readlines()
'''
f=sys.argv[1]
list=[1,3]
line=get_colunm_awk(f,list)
for i in line:
print (i,end="")
'''
######## 选取想要的列——python.re ########
#输入参数: 文件标识 f
#输入参数: 分割标识 split_flag
#输入参数: 需要获取的列的id的列表 list
#输出结果: 需要截取的列存入一个列表中,输出此列表: line
def get_colunm_pyre(f, list, split_flag=None):
    """Extract columns from each line using ``str.split``.

    Args:
        f: Iterable of text lines, e.g. an open text file.  It is not
            closed by this function.
        list: 1-based column numbers to keep, in the desired order.
        split_flag: Separator handed to ``str.split``; ``None`` splits on
            runs of whitespace.

    Returns:
        One flat list holding the selected fields of the first line,
        followed by those of the second line, and so on.  No line
        separators are inserted.

    Raises:
        IndexError: If a line has fewer columns than a requested number.
    """
    picked = []
    for text in f:
        columns = text.strip("\n").split(split_flag)
        picked.extend(columns[number - 1] for number in list)
    return picked
'''
t=sys.argv[1]
f=open(t,"r",encoding='UTF-8')
list=[1]
#split_flag="="
line=get_colunm_pyre(f,list)
p="hello"
flagi=0
for k in line:
if p != k:
if flagi==0:
print("%s\t1"%(k))
flagi+=1
if flagi%61==0:
print ("%s\t%d"%(k,flagi))
p=k
'''
'''
for i in f:
#print(type(i),i)
i=i.strip("\n")
i=i.strip("\t")
try:
i=float(i)
except ValueError:
print (type(i),i)
nian=i//10000;
yue=(i%10000)//100
r=i%100
print("%d\t%d\t%d年%d月"%(nian,yue,nian,yue))
'''
######## 选取想要的字符串并按照理想格式输出 ########
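#Input parameters:
#first: strip line breaks and whitespace and write the content out as a single line (out_first takes two file paths: inf, outf)
#Input parameter line (used by the later helpers): the list holding the already pre-processed file content; if the file needs no pre-processing, call match_by_line(f) directly to obtain line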
def out_first(inf, outf):
    """Copy ``inf`` to ``outf`` with every whitespace character removed.

    The input is read in binary mode line by line; each line is split on
    ASCII whitespace and the pieces are written back to back, so the output
    is a single line without spaces or line breaks.

    Args:
        inf: Path of the file to read.
        outf: Path of the file to write; it is created or truncated.
    """
    with open(inf, "rb") as src, open(outf, "wb") as dst:
        for raw in src:
            dst.write(b"".join(raw.split()))
#inf=sys.argv[1]
#outf=sys.argv[2]
#out_first(inf,outf)
#second 对某个值进行评估比较后输出
#对某一列的值比较后输出此行
#输入参数:line 此前文件处理函数返回的文件内容的列表
#Input parameter: colum_num  1-based number of the column whose value is compared
#输入参数:compare_flag 比较符号eg">","<","<=","="
#输入参数:becompared_value 被比较的值
def out_after_compare(line, colum_num, compare_flag, becompared_value):
    """Keep the rows whose chosen column compares true against a value.

    Each row is split on whitespace and the field in column ``colum_num``
    is converted to the type of ``becompared_value`` before comparing.

    Args:
        line: Iterable of text rows (e.g. the result of an earlier helper).
        colum_num: 1-based number of the column to compare.
        compare_flag: One of ``">"``, ``">="``, ``"<"``, ``"<="``, ``"="``
            (``"=="`` is accepted as an alias) or ``"!="``.  Any other
            value selects nothing.
        becompared_value: Reference value.  Its exact type (``str``,
            ``int`` or ``float``) decides how the column is converted.  For
            any other type a message is printed once and the raw string
            field is compared as is.

    Returns:
        The matching rows, unmodified and in input order.

    Raises:
        IndexError: If a row has fewer columns than ``colum_num``.
        ValueError: If a field cannot be converted to ``int``/``float``.
    """
    import operator  # local import: the file header only imports sys, os, re

    comparators = {
        ">": operator.gt,
        ">=": operator.ge,
        "<": operator.lt,
        "<=": operator.le,
        "=": operator.eq,
        "==": operator.eq,
        "!=": operator.ne,
    }
    # Exact-type lookup on purpose: bool is not treated as int.
    convert = {str: str, int: int, float: float}.get(type(becompared_value))
    if convert is None:
        print ("unsupprot long or complex type, or your becompared_value is unrecognized")
    compare = comparators.get(compare_flag)

    backline = []
    for row in line:
        cell = row.split()[colum_num - 1]
        if convert is not None:
            cell = convert(cell)
        if compare is not None and compare(cell, becompared_value):
            backline.append(row)
    return backline
'''
#eg:
t=sys.argv[1]
f=open(t,"r")
line=match_by_line(f)
colum_num=3
compare_flag="="
becompared_value=1576045117499
backline=out_after_compare(line,colum_num,compare_flag,becompared_value)
for i in backline:
print (i,end="")
'''
'''
#big eg
t=sys.argv[1]
f=open(t,"r")
str_match="ST=\d+"
line=match_by_str(f,str_match)
o=open("lingshi.f","w")
for i in line:
o.write(i)
o.close()
o=open("lingshi.f","r")
list=[1]
split_flag="Gateway1,Node=Node1 ST="
backline=get_colunm_pyre(o,list,split_flag)
for i in backline:
print (i,end="")
'''
#对其中某一列进行分列处理
#输入参数: colum_num 需要处理的列的编号,从1开始
#输入参数:line 已经把文件初步处理过的,存入列表line中
#输入参数:split_flag 分割标识
def split_colum(line, colum_num, split_flag):
    """Split one column of every row into several columns.

    Each row is split on whitespace.  The field in column ``colum_num`` is
    split on ``split_flag`` and the resulting pieces are inserted, in
    order, in front of that field; the original, unsplit field stays in
    place right after them.  The fields are then joined by single spaces.

    Args:
        line: Iterable of text rows.
        colum_num: 1-based number of the column to split.
        split_flag: Separator used to split the chosen field.

    Returns:
        A flat list that alternates between a rebuilt row (without line
        ending) and a separate ``"\\n"`` element, so that printing all
        elements back to back reproduces one row per line.

    Raises:
        IndexError: If a row has fewer columns than ``colum_num``.
    """
    position = colum_num - 1
    backline = []
    for row in line:
        fields = row.split()
        pieces = fields[position].split(split_flag)
        # Inserting in reverse at a fixed index leaves the pieces in order.
        for piece in reversed(pieces):
            fields.insert(position, piece)
        backline.append(' '.join(fields))
        backline.append("\n")
    return backline
'''
t=sys.argv[1]
f=open(t,"r")
colum_num=2
split_flag="="
line=match_by_line(f)
backline=split_colum(line,colum_num,split_flag)
for i in backline:
print (i,end="")
'''
#把大文件分割成小文件
#first 按照行数分割
#输入参数:filename 文件名
#Input parameter: linenum  number of lines per output part file
def split_file_by_line(filename, linenum):
    """Split a text file into parts of at most ``linenum`` lines each.

    The parts are written to the directory ``split_dir/`` (created in the
    current working directory when missing) and are named
    ``<name>.part<N>.txt``, where ``<name>`` is the input file name without
    directory and extension and ``N`` counts up from 0.  The name of every
    finished part is printed.  The input is streamed once; an empty input
    produces a single empty ``part0`` file.

    Args:
        filename: Path of the text file to split.
        linenum: Maximum number of lines per part; must be at least 1.

    Raises:
        ValueError: If ``linenum`` is smaller than 1.
    """
    if linenum < 1:
        raise ValueError("linenum must be at least 1")

    dir_put = 'split_dir/'
    if not os.path.isdir(dir_put):
        os.mkdir(dir_put)
    # Use only the base name so that inputs given with a directory part
    # still land directly inside split_dir/.
    filename_front = os.path.splitext(os.path.basename(filename))[0]

    def part_name(index):
        return filename_front + '.part' + str(index) + '.txt'

    n = 0          # index of the part currently being written
    written = 0    # lines already written to the current part
    with open(filename, "r") as src:
        temp = open(dir_put + part_name(n), 'w')
        try:
            for text in src:
                if written == linenum:
                    # Current part is full: finish it and start the next.
                    temp.close()
                    print(part_name(n))
                    n += 1
                    written = 0
                    temp = open(dir_put + part_name(n), 'w')
                temp.write(text)
                written += 1
        finally:
            temp.close()
    print(part_name(n))
'''
t=sys.argv[1]
linenum=int(input("enter size:"))
split_file_by_line(t,linenum)
'''
#second 按照大小分割
#Input parameter: t     name of the file to split
#Input parameter: size  target size of each part, counted in 1024-byte chunks (KB), not in bytes
def split_file_by_KB(t, size):
    """Split a file into parts of ``size`` chunks of 1024 bytes each.

    The input is read in binary mode, 1024 bytes at a time.  The parts are
    written to the directory ``split_dir/`` (created in the current working
    directory when missing) and are named ``<name>.part<N>.txt``, where
    ``<name>`` is the input file name without directory and extension and
    ``N`` counts up from 0.  The name of every finished part is printed.
    An empty input produces a single empty ``part0`` file.

    Args:
        t: Path of the file to split.
        size: Number of 1024-byte chunks per part (i.e. the part size in
            KB).  With a value below 1 the file is never split and is
            copied into a single part.
    """
    dir_put = 'split_dir/'
    if not os.path.isdir(dir_put):
        os.mkdir(dir_put)
    # Use only the base name so that inputs given with a directory part
    # still land directly inside split_dir/.
    filename_front = os.path.splitext(os.path.basename(t))[0]

    def part_name(index):
        return filename_front + '.part' + str(index) + '.txt'

    i = 0   # index of the part currently being written
    n = 0   # chunks already written to the current part
    with open(t, 'rb') as fp:
        temp = open(dir_put + part_name(i), 'wb')
        try:
            while True:
                buf = fp.read(1024)
                if not buf:      # end of input
                    break
                if size > 0 and n == size:
                    # Current part is full: finish it and start the next.
                    temp.close()
                    print (part_name(i))
                    i += 1
                    n = 0
                    temp = open(dir_put + part_name(i), 'wb')
                temp.write(buf)
                n += 1
        finally:
            temp.close()
    print (part_name(i))
'''
if __name__=='__main__':
t=sys.argv[1]
size=int(input("enter size:")) #注意转换为int,否则无效
split_file_by_KB(t,size)
'''
'''
统计有多少个大于30的
f=open("Node2_ET_num.all","r")
t=0
a=0
b=0
for i in f:
k=i.split()
k[0]=float(k[0])
k[1]=float(k[1])
k[2]=float(k[2])
if k[0]>=20:
if k[1]!=a and k[2]!=b:
t+=1
a=k[1]
b=k[2]
print (t)
'''
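# Some file-processing functions I wrote myself, recorded here for reference.
# (Blog page footer: latest recommended article published 2023-05-17 18:49:46)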