用python读取多种格式文件

本文整理了用 Python 读取 JSON、Excel、CSV、txt、mgf、xml 等多种格式文件的示例代码，并附带写 Excel 的示例，文末给出了相关参考链接。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Json：

# Collect the 'use_time' of every call in the first transaction, as strings.
# `address` (path of the JSON file) is not set in this snippet and must
# already be defined.
# JSON interchange text is UTF-8, so the encoding is named explicitly rather
# than left to the platform's locale default.
with open(address, 'r', encoding='utf-8') as f:
    mobile = json.load(f)
calls = mobile["transactions"][0]["calls"]
use_time = [str(call['use_time']) for call in calls]

Excel:

# Walk the first sheet of the workbook starting at row 1 (row 0 is skipped)
# and, for each row whose date cell is not empty, derive a datetime and a
# phone-number string.
# `address`, `date_index`, `phone_index` and `ctype` are not set in this
# snippet and must already be defined.
rawdata1 = open_workbook(address)
rawdata = rawdata1.sheet_by_index(0)
for i in range(1, rawdata.nrows):
    date1 = rawdata.cell(i, date_index).value
    if date1 == "":  # skip blank rows
        continue
    if ctype == 3:  # 3 marks a date cell: convert it with the datetime module
        # Use the workbook's own date system (1900- or 1904-based) instead of
        # a hard-coded 0, so 1904-based files are not shifted by four years.
        date2 = xldate_as_tuple(date1, rawdata1.datemode)
        date3 = datetime(*date2)
        phone_text = str(rawdata.cell(i, phone_index).value)
        # When the text contains '.', drop its last two characters
        # (presumably the '.0' of a float-valued cell -- verify on real data).
        phone1 = phone_text[:-2] if "." in phone_text else phone_text

import numpy as np
import xlrd   #使用库函数

# Open the workbook at a fixed path and select its worksheet named 'Sheet1'.
workbook = xlrd.open_workbook('C:/users/lenovo/desktop/student_score.xlsx')
sheet = workbook.sheet_by_name('Sheet1')

# Read columns 0, 1 and 2 in one unpacking, one value list per column.
# (sheet.row_values(0) would read the first row instead of a column.)
data_name, data_st_ID, data_st_score = (
    sheet.col_values(column) for column in range(3)
)

写EXCEL：

# Build a one-sheet workbook: a header row, then one row per entry of
# `phonelist` with its value looked up in `dic`, and save it as an .xls file.
# `phonelist`, `dic` and `fileName` are not set in this snippet and must
# already be defined.
Excel_file = xlwt.Workbook()
sheet = Excel_file.add_sheet('sheet0')
header = [u'号码', '日期top1', '日期top2', '日期top3']

# Header cells go in row 0, one column per title.
for col, title in enumerate(header):
    sheet.write(0, col, title)

# Data starts at row 1: column 0 gets the phone entry, column 1 the value
# stored in `dic` under that entry's string form.
for row, phone in enumerate(phonelist, start=1):
    sheet.write(row, 0, phone)
    sheet.write(row, 1, dic[str(phone)])

# Save the workbook under a file name derived from `fileName`.
Excel_file.save("C:/Users/Desktop/100个文件输出xls/" + str(fileName) + ".xls")

CSV：

# Load the CSV and pick the start-time column, which may be named either
# 'start_time' or 'begin_time'; 'start_time' wins when both are present.
# `address` is not set in this snippet and must already be defined.
rawdata = pd.read_csv(address, skip_blank_lines=True)  # blank lines are skipped
# The former outer test `'start_time' or 'begin_time' in rawdata.columns` was
# always true (a non-empty string is truthy), so it guarded nothing; each
# column name is now checked on its own.
if 'start_time' in rawdata.columns:
    start_time = rawdata['start_time']
elif 'begin_time' in rawdata.columns:
    start_time = rawdata['begin_time']

txt:

# Read a comma-separated text file whose first line holds the column names:
# find the date and phone columns by inspecting the second line, then append
# the date field of every data line (second line onward) to `start_time`.
# `address` and the `start_time` list are not set in this snippet and must
# already be defined.
#
# This is a single pass inside a with-block. The earlier two-loop version
# reused the same file iterator, so its second loop resumed after the line it
# had stopped on and then skipped one more line, losing the first two data
# rows; it also never closed the file.
a = []  # fields of the second line, used to locate the columns
with open(address, 'r') as rawdata:
    for line_no, line in enumerate(rawdata):
        if line_no == 0:
            continue  # first line is the column names
        # Drop the line terminator so the last field is clean; otherwise a
        # trailing '\n' defeats isdigit() and leaks into the stored values.
        fields = line.rstrip('\r\n').split(',')  # comma is the separator
        if line_no == 1:
            a = fields
            for j, field in enumerate(a):
                if '-' in field or '/' in field:
                    date_index = j  # field looks like a date
                elif field.isdigit() and len(field) > 5:
                    phone_index = j  # long all-digit field: taken as a phone number
        if len(line) < 10:  # treat very short lines as blank and skip them
            continue
        start_time.append(fields[date_index])

mgf:

from pyteomics import mgf
from pyteomics import mgf
 
# Iterate over every spectrum in test.mgf, printing each spectrum and the
# fourth key of its params mapping, then print the params / title / seq of
# the last spectrum seen.
# Start from None so the final prints still work when the file has no spectra.
params = title = seq = None
# The reader holds an open file; the with-block closes it once iteration ends.
with mgf.read("test.mgf") as reader:
    for spectrum in reader:
        print(spectrum)
        params = spectrum.get('params')
        key = list(params)[3]  # fourth key of the params mapping
        print(key)
        title = params.get('title')
        seq = params.get('seq')

print(params)
print(title)
print(seq)

xml:

# -*- coding:gb2312 -*-
# coding = utf-8
from pylab import *
import  xml.dom.minidom
def read_xml():
    """Read captions, ids, times and font size from ``abc.xml``.

    The file is parsed from the current working directory. Values are kept
    as text: calling ``.encode("gbk")`` before ``.split(",")`` yields bytes
    on Python 3, where splitting on a ``str`` separator raises ``TypeError``.

    Returns:
        A 4-tuple ``(list_str, list_fig_num, list_time, su)``:
        ``list_str`` is the text of every ``<caption>`` element;
        ``list_fig_num`` is the first ``<maxid>`` text split on ``","`` and
        converted to ints; ``list_time`` is the first ``<time>`` text split
        on ``","``; ``su`` is the text of the first ``<font_size>`` element.

    Raises:
        IndexError: if there is no ``<maxid>``, ``<time>`` or ``<font_size>``
            element.
        AttributeError: if a looked-up element has no child node.
        ValueError: if a ``<maxid>`` item is not an integer.
    """
    dom = xml.dom.minidom.parse('abc.xml')  # open the XML document

    def texts(tag):
        # Data of the first child node of every element named `tag`.
        return [node.firstChild.data for node in dom.getElementsByTagName(tag)]

    list_str = [str(text) for text in texts('caption')]
    list_fig_num = [int(item) for item in texts('maxid')[0].split(",")]
    list_time = texts('time')[0].split(",")
    su = texts('font_size')[0]
    return list_str, list_fig_num, list_time, su

在这里插入图片描述
参考：
https://blog.csdn.net/CrazyTTT/article/details/79663895
https://blog.csdn.net/Mr_Cat123/article/details/84857268
https://blog.csdn.net/Z_shoushow/article/details/83022661
https://blog.csdn.net/yiweiyi329/article/details/78184226