python数据集处理一些方法备份(长期更新)

python数据处理的一些公用方法


最近做实验,写了很多程序处理数据集,总结一下,省得下回重写。
1.get_all_files:遍历路径下所有的文件,以文件名排序

def get_all_files(bg_path):
    """Recursively collect every file path under ``bg_path``, sorted numerically.

    Entries that are not regular files are treated as directories and
    descended into.  The sort key is ``int(path[-9:-4])``, i.e. the five
    characters in front of the last four characters of the path, so every
    path must carry digits there (e.g. ``00012.jpg``) or ``int`` raises
    ``ValueError``.

    Returns:
        list of joined paths in ascending order of that number.
    """
    collected = []
    for entry in os.listdir(bg_path):
        full_path = os.path.join(bg_path, entry)
        if os.path.isfile(full_path):
            collected.append(full_path)
        else:
            collected.extend(get_all_files(full_path))
    # Ascending order by the number embedded in the file name.
    return sorted(collected, key=lambda p: int(p[-9:-4]))

1.1遍历目录

for i in os.listdir(jpg_path):

1.2分离文件名和文件类型

# Split "name.ext" into ("name", ".ext"); the extension keeps its leading dot.
(filename,extension) = os.path.splitext(filename_type)

2.创建目录

def mkdir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` before it is used.

    Returns:
        True if the directory was created (a message is printed),
        False if the path already existed.
    """
    cleaned = path.strip().rstrip("\\")
    if os.path.exists(cleaned):
        # Already present: nothing to create.
        return False
    os.makedirs(cleaned)
    print('%s创建成功' % cleaned)
    return True

4.bbox相关的三个:根据mask计算bbox,判断两个bbox是不是相交,判断遮挡了自己多少

def findbbox(mask):
    """Compute the bounding box of the foreground in a mask.

    A pixel is foreground when channel 0 of ``mask`` is greater than zero.
    The input array is only read, never modified.

    Args:
        mask: array indexed as ``mask[row, col, channel]`` (must support
            NumPy-style slicing and comparison).

    Returns:
        ``[minx, miny, maxx, maxy]`` as Python ints, where x is the column
        index and y is the row index (both inclusive).  For a mask without
        any foreground the historical sentinel ``[1000, 1000, 0, 0]`` is
        returned.
    """
    foreground = mask[:, :, 0] > 0
    rows, cols = foreground.nonzero()
    if rows.size == 0:
        # Kept for backward compatibility: this is what the original loop
        # returned when it never found a foreground pixel.
        return [1000, 1000, 0, 0]
    # Take the real extrema instead of starting the minimum at a fixed 1000,
    # which produced wrong boxes for objects lying beyond index 1000.
    return [int(cols.min()), int(rows.min()), int(cols.max()), int(rows.max())]
def mat_inter(box1,box2):
    """Tell whether two axis-aligned rectangles intersect (touching counts).

    Each box is ``(x0, y0, x1, y1)``.  The rectangles intersect when, on
    both axes, the distance between their centres does not exceed half the
    sum of their extents.

    Returns:
        True if the boxes overlap or touch, otherwise False.
    """
    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2
    # Distance between the two centres on each axis.
    centre_dx = abs((ax0 + ax1) / 2 - (bx0 + bx1) / 2)
    centre_dy = abs((ay0 + ay1) / 2 - (by0 + by1) / 2)
    # Half of the combined width / height.
    half_width_sum = (abs(ax0 - ax1) + abs(bx0 - bx1)) / 2
    half_height_sum = (abs(ay0 - ay1) + abs(by0 - by1)) / 2
    return centre_dx <= half_width_sum and centre_dy <= half_height_sum
def solve_coincide(box1,box2):
    """Return the fraction of ``box1`` that is covered by ``box2``.

    Both boxes are ``(x0, y0, x1, y1)``.  The result is the intersection
    area divided by the area of ``box1`` only (not an IoU).

    Returns:
        ``False`` when ``mat_inter`` reports that the boxes do not intersect;
        ``0.0`` when ``box1`` has zero area (the ratio is undefined, so no
        coverage is reported instead of dividing by zero);
        otherwise ``intersection / area(box1)``.
    """
    if not mat_inter(box1, box2):
        return False
    x01, y01, x02, y02 = box1
    x11, y11, x12, y12 = box2
    area1 = (x02 - x01) * (y02 - y01)
    if area1 == 0:
        # A degenerate box (e.g. a one-pixel bbox) used to raise
        # ZeroDivisionError here.
        return 0.0
    col = min(x02, x12) - max(x01, x11)
    row = min(y02, y12) - max(y01, y11)
    intersection = col * row
    return intersection / area1

5.txt的创建和写入

# Reading
 f = open(point_path_filter, 'r')
 line = f.readline()# reads a single line
 lines = f.readlines()# reads all remaining lines into a list
 def read_file(path):
     """Return every line of the text file at ``path`` as a list of strings.

     Line endings are kept.  The file is opened in a ``with`` block so the
     handle is closed even if reading fails.
     """
     with open(path, 'r') as fid:
         return fid.readlines()
# Writing
out_label_path = out_path +"/"+ category_main+'/txt/' + label_path
file = open(out_label_path, 'w')# mode 'w' creates the file if it does not exist
file.write(line + "\n")# keep reusing this same file object for later writes

6.分割训练集和测试集合:这里是000000这种格式:

import os
import math
import random
# 1. Split ratio: fraction of the samples that goes into the test set
test=0.3
# Directory that is scanned for the image files
image_path='../output/ape/image'
def get_all_files(bg_path):
    """Walk ``bg_path`` recursively and return its file paths in numeric order.

    Anything that is not a regular file is assumed to be a directory and is
    walked as well.  Paths are ordered by ``int(path[-9:-4])``; a path
    without digits at that position makes ``int`` raise ``ValueError``.
    """
    def trailing_number(path):
        # Five characters preceding the last four characters of the path.
        return int(path[-9:-4])

    found = []
    for name in os.listdir(bg_path):
        candidate = os.path.join(bg_path, name)
        if not os.path.isfile(candidate):
            found += get_all_files(candidate)
            continue
        found.append(candidate)
    found.sort(key=trailing_number)  # ascending
    return found
# Split the sample ids 000000 .. num-1 into ./test.txt and ./train.txt.
files = get_all_files(image_path)
num = len(files)
test_num = int(test * num)

# 2. Draw the test ids.  The population is range(num): the previous
#    range(0, num-1) could never pick the last sample.
random_index = random.sample(range(num), test_num)
held_out = set(random_index)

# 3. Write the test ids in the order they were drawn.  The with-block
#    guarantees the file is flushed and closed.
out_label_path = './test.txt'
with open(out_label_path, 'w') as file:
    file.writelines('%06d\n' % index for index in random_index)

# Every id that is not in the test split goes to train, in ascending order.
out_label_path = './train.txt'
with open(out_label_path, 'w') as file2:
    file2.writelines(
        '%06d\n' % index for index in range(num) if index not in held_out
    )
import os
import numpy as np
import scipy.misc
import random
import cv2
import sys
import shutil
import time
# Total number of image/annotation pairs the split is drawn from.
num_img = 9999
# Size of the validation split: 20% of the images, rounded down.
vaild = int(num_img * 0.2)
def get_all_files(bg_path):
    """Return all file paths below ``bg_path`` (recursively), numerically sorted.

    Non-file entries are treated as directories and recursed into.  The
    ordering key is ``int(path[-8:-4])`` — the four characters before the
    last four characters of the path — so those characters must be digits.
    """
    paths = []
    for child in os.listdir(bg_path):
        joined = os.path.join(bg_path, child)
        nested = [joined] if os.path.isfile(joined) else get_all_files(joined)
        paths.extend(nested)
    # Ascending by the four-digit number in the file name.
    return sorted(paths, key=lambda item: int(item[-8:-4]))
# Move a random subset of image/annotation pairs into the validation folders.
mask_files=get_all_files('./annotations')
img_files=get_all_files('./images')
out_img='./validation/images'
out_mask='./validation/annotations'

# Indices are drawn from range(num_img); fail before anything is moved if a
# list is too short, instead of hitting IndexError after a partial move.
if min(len(img_files), len(mask_files)) < num_img:
    raise ValueError(
        'expected at least %d images and annotations, found %d and %d'
        % (num_img, len(img_files), len(mask_files))
    )

# shutil.move does not create missing destination directories.
for out_dir in (out_img, out_mask):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

randomlist=random.sample(range(num_img), vaild)
for random_ims_index in randomlist:
    print(random_ims_index)

    random_img=img_files[random_ims_index]
    random_mask=mask_files[random_ims_index]
    # Use the real file name; slicing the last 14 characters only worked
    # for names that happen to be exactly 14 characters long.
    out_name_img=os.path.join(out_img,os.path.basename(random_img))
    out_name_mask=os.path.join(out_mask,os.path.basename(random_mask))
    shutil.move(random_img,out_name_img)
    shutil.move(random_mask,out_name_mask)

7.GEN_Annotations:用于生成xml文件,使用方法如下:

from lxml import etree#引入这个包
anno = GEN_Annotations(name)#1. Create the annotation object; name is stored as <filename> and reused for the XML file name below
anno.set_size(640, 480, 3)#2. Pass the image width, height and channel count
 anno.add_pic_attr(category_name, xmin, ymin, xmax, ymax, str(splitlines[3]),str(splitlines[4]),
                         str(splitlines[5]), str(splitlines[6]), str(splitlines[7]), str(splitlines[8]),
                        str(splitlines[9]), str(splitlines[10])
                         , str(splitlines[11]), str(splitlines[12]),str(splitlines[13]), str(splitlines[14]),
                        str(splitlines[15]), str(splitlines[16]),
                         str(splitlines[17]), str(splitlines[18]))#3. Pass the node values: label, bbox and the 16 point coordinates
xml_path=out_path + "/" + category_main + '/xml/' + name+'.xml'
anno.savefile(xml_path)#4. Write the XML to this path
#生成xml
class GEN_Annotations:
    """Build an ``<annotation>`` XML tree with size and per-object entries.

    Typical use: construct with the file name, call ``set_size`` once, call
    ``add_pic_attr`` once per object, then ``savefile``.
    """

    def __init__(self, filename):
        """Create the root ``<annotation>`` with a ``<filename>`` child."""
        self.root = etree.Element("annotation")
        filename_node = etree.SubElement(self.root, "filename")
        filename_node.text = filename

    def set_size(self,witdh,height,channel):
        """Append ``<size>`` holding ``<width>`` and ``<height>``.

        ``channel`` is accepted but not written to the tree.
        """
        size = etree.SubElement(self.root, "size")
        for tag, value in (("width", witdh), ("height", height)):
            node = etree.SubElement(size, tag)
            node.text = str(value)

    def savefile(self,xml_path):
        """Write the tree to ``xml_path`` as pretty-printed UTF-8 without an XML declaration."""
        document = etree.ElementTree(self.root)
        document.write(xml_path, pretty_print=True, xml_declaration=False, encoding='utf-8')

    def add_pic_attr(self,label,xmin,ymin,xmax,ymax,x1,y1,x2,y2,x3,y3,x4,y4,x5,y5,x6,y6,x7,y7,x8,y8):
        """Append one ``<object>`` with its name, bounding box and eight 2D points.

        The bounding-box values are written as given.  Each point value is
        parsed as a float and multiplied by 640 (x) or 480 (y) before being
        written — presumably the inputs are coordinates normalised to a
        640x480 image; confirm against the caller.
        """
        object_node = etree.SubElement(self.root, "object")
        name_node = etree.SubElement(object_node, "name")
        name_node.text = label

        bndbox = etree.SubElement(object_node, "bndbox")
        corners = (("xmin", xmin), ("ymin", ymin), ("xmax", xmax), ("ymax", ymax))
        for tag, value in corners:
            node = etree.SubElement(bndbox, tag)
            node.text = str(value)

        points2d = etree.SubElement(object_node, "points2d")
        point_values = (x1, y1, x2, y2, x3, y3, x4, y4,
                        x5, y5, x6, y6, x7, y7, x8, y8)
        for position, raw in enumerate(point_values):
            # Even positions are x values (scaled by 640), odd ones y (by 480).
            if position % 2 == 0:
                axis, scale = "x", 640
            else:
                axis, scale = "y", 480
            node = etree.SubElement(points2d, axis + str(position // 2 + 1))
            node.text = str(float(str(raw)) * scale)

对应结果:

<annotation>
  <filename>000000</filename>
  <size>
    <width>640</width>
    <height>480</height>
  </size>
  <object>
    <name>eggbox</name>
    <bndbox>
      <xmin>237</xmin>
      <ymin>257</ymin>
      <xmax>308</xmax>
      <ymax>339</ymax>
    </bndbox>
    <points2d>
      <x1>288.7264</x1>
      <y1>348.72096</y1>
      <x2>284.69888000000003</x2>
      <y2>340.78752</y2>
      <x3>234.76672000000002</x3>
      <y3>331.69487999999996</y3>
      <x4>227.35807999999997</x4>
      <y4>322.80143999999996</y4>
      <x5>314.06784</x5>
      <y5>274.66704000000004</y5>
      <x6>311.80096</x6>
      <y6>262.55568</y6>
      <x7>262.20608</x7>
      <y7>259.47695999999996</y7>
      <x8>256.82304</x8>
      <y8>246.6264</y8>
    </points2d>
  </object>
  <object>
    <name>can</name>
    <bndbox>
      <xmin>318</xmin>
      <ymin>200</ymin>
      <xmax>397</xmax>
      <ymax>313</ymax>
    </bndbox>
    <points2d>
      <x1>391.76063999999997</x1>
      <y1>318.99744</y1>
      <x2>365.53600000000006</x2>
      <y2>207.9336</y2>
      <x3>315.95392</x3>
      <y3>310.76592</y3>
      <x4>289.70176</x4>
      <y4>213.05232</y4>
      <x5>424.73728</x5>
      <y5>295.02144</y5>
      <x6>401.82848</x6>
      <y6>190.51488</y6>
      <x7>349.69728000000003</x7>
      <y7>289.94208</y7>
      <x8>326.4096</x8>
      <y8>197.34</y8>
    </points2d>
  </object>
  <object>
    <name>ape</name>
    <bndbox>
      <xmin>244</xmin>
      <ymin>150</ymin>
      <xmax>287</xmax>
      <ymax>207</ymax>
    </bndbox>
    <points2d>
      <x1>288.31424</x1>
      <y1>208.85424</y1>
      <x2>289.08608000000004</x2>
      <y2>162.22128</y2>
      <x3>244.11584000000002</x3>
      <y3>209.17487999999997</y3>
      <x4>242.46528</x4>
      <y4>162.14927999999998</y4>
      <x5>286.54656</x5>
      <y5>187.57488</y5>
      <x6>287.18208</x6>
      <y6>142.66416</y6>
      <x7>244.89024</x7>
      <y7>187.70976000000002</y7>
      <x8>243.37984</x8>
      <y8>142.4352</y8>
    </points2d>
  </object>
  <object>
    <name>holepuncher</name>
    <bndbox>
      <xmin>287</xmin>
      <ymin>347</ymin>
      <xmax>364</xmax>
      <ymax>412</ymax>
    </bndbox>
    <points2d>
      <x1>351.78624</x1>
      <y1>416.40912000000003</y1>
      <x2>345.20704</x2>
      <y2>356.328</y2>
      <x3>282.40256</x3>
      <y3>415.30848000000003</y3>
      <x4>276.3296</x4>
      <y4>358.41216000000003</y4>
      <x5>373.77727999999996</x5>
      <y5>398.14512</y5>
      <x6>367.78495999999996</x6>
      <y6>343.89696</y6>
      <x7>309.71520000000004</x7>
      <y7>398.02608</y7>
      <x8>304.14336</x8>
      <y8>346.38719999999995</y8>
    </points2d>
  </object>
  <object>
    <name>cat</name>
    <bndbox>
      <xmin>241</xmin>
      <ymin>206</ymin>
      <xmax>305</xmax>
      <ymax>269</ymax>
    </bndbox>
    <points2d>
      <x1>263.38752</x1>
      <y1>218.31408</y1>
      <x2>258.76544</x2>
      <y2>181.04016</y2>
      <x3>310.61184000000003</x3>
      <y3>261.1416</y3>
      <x4>310.89536</x4>
      <y4>226.77168</y4>
      <x5>236.17664000000002</x5>
      <y5>239.64000000000001</y5>
      <x6>228.54016000000001</x6>
      <y6>203.65584</y6>
      <x7>283.39455999999996</x7>
      <y7>284.44752</y7>
      <x8>280.6528</x8>
      <y8>251.73023999999998</y8>
    </points2d>
  </object>
  <object>
    <name>duck</name>
    <bndbox>
      <xmin>393</xmin>
      <ymin>222</ymin>
      <xmax>443</xmax>
      <ymax>284</ymax>
    </bndbox>
    <points2d>
      <x1>415.23263999999995</x1>
      <y1>251.8536</y1>
      <x2>412.20224</x2>
      <y2>206.78496</y2>
      <x3>455.67424000000005</x3>
      <y3>261.54479999999995</y3>
      <x4>454.19904</x4>
      <y4>214.5504</y4>
      <x5>381.45152</x5>
      <y5>280.01376000000005</y5>
      <x6>376.7232</x6>
      <y6>232.69632</y6>
      <x7>423.22688</x7>
      <y7>291.96768</y7>
      <x8>420.14656</x8>
      <y8>242.54543999999999</y8>
    </points2d>
  </object>
  <object>
    <name>driller</name>
    <bndbox>
      <xmin>308</xmin>
      <ymin>75</ymin>
      <xmax>387</xmax>
      <ymax>235</ymax>
    </bndbox>
    <points2d>
      <x1>352.68416</x1>
      <y1>206.10384000000002</y1>
      <x2>338.01408000000004</x2>
      <y2>77.6952</y2>
      <x3>397.4336</x3>
      <y3>203.58048</y3>
      <x4>385.51232</x4>
      <y4>71.8008</y4>
      <x5>314.22912</x5>
      <y5>251.62511999999998</y5>
      <x6>291.6992</x6>
      <y6>88.24656</y6>
      <x7>370.68544</x7>
      <y7>249.74016</y7>
      <x8>352.58048</x8>
      <y8>80.84832</y8>
    </points2d>
  </object>
  <object>
    <name>glue</name>
    <bndbox>
      <xmin>393</xmin>
      <ymin>72</ymin>
      <xmax>469</xmax>
      <ymax>201</ymax>
    </bndbox>
    <points2d>
      <x1>396.23296000000005</x1>
      <y1>176.80704</y1>
      <x2>443.35168000000004</x2>
      <y2>61.71216</y2>
      <x3>444.08831999999995</x3>
      <y3>199.73520000000002</y3>
      <x4>497.38304</x4>
      <y4>83.832</y4>
      <x5>388.0928</x5>
      <y5>181.91232</y5>
      <x6>436.38272</x6>
      <y6>62.42448</y6>
      <x7>437.63968</x7>
      <y7>205.88400000000001</y7>
      <x8>492.59136</x8>
      <y8>85.53696</y8>
    </points2d>
  </object>
</annotation>

7.得到当前文件夹下某格式的所有文件名字

def get_type(class_path,class_name):
    """Return the first entry of ``class_path`` named ``<alnum>.<class_name>``.

    An entry qualifies when its whole name is one or more ASCII letters or
    digits, a dot, and then ``class_name`` taken literally (regex
    metacharacters in ``class_name`` are escaped).  Each directory entry is
    matched on its own, so the result is always a real entry name rather
    than a fragment of one.

    Args:
        class_path: directory to list.
        class_name: file extension without the leading dot, e.g. ``"xyz"``.

    Returns:
        The matching entry name (not joined with ``class_path``).

    Raises:
        IndexError: if no entry matches (same exception type as before).
    """
    name_pattern = re.compile(r'[0-9a-zA-Z]+\.' + re.escape(class_name) + r'\Z')
    for entry in os.listdir(class_path):
        if name_pattern.match(entry):
            return entry
    raise IndexError('no "*.%s" file found in %s' % (class_name, class_path))

调用方式:

# s is the name of the first "<name>.xyz" entry found in class_xyz_path.
s=get_type(class_xyz_path,"xyz")

8.打印完全的numpy值

# NOTE(review): this repeats the get_type call from the previous section; the
# heading suggests a numpy print-options call was intended here — confirm.
s=get_type(class_xyz_path,"xyz")

9.把输出值输出到文件

import sys
savedStdout = sys.stdout  # keep the original standard output stream
try:
    with open('./3dpoints_gt_z.txt', 'wt') as file:
        sys.stdout = file  # redirect standard output to the file
        # Print arrays in full; NumPy rejects threshold='nan' and documents
        # sys.maxsize for untruncated output.
        np.set_printoptions(threshold=sys.maxsize)
        print(transform_3d_gt[2].tolist())
finally:
    # Restore stdout even if opening the file or printing fails.
    sys.stdout = savedStdout

10.使用matplotlib绘制3维图像

# Ground-truth data: one coordinate list per axis
gt_x=transform_3d_gt[0].tolist()
gt_y=transform_3d_gt[1].tolist()
gt_z=transform_3d_gt[2].tolist()
# Start plotting
fig=plt.figure(dpi=120)
ax=fig.add_subplot(111,projection='3d')
# Title
plt.title('point cloud')
# Draw every (x, y, z) point.  No cmap is passed: the points use the fixed
# colour 'b', so a colormap is never applied, and the old name 'spectral'
# is no longer a valid colormap in current Matplotlib.
ax.scatter(gt_x,gt_y,gt_z,c='b',marker='.',s=1,linewidth=0,alpha=0.5)
# NOTE(review): 'scaled' on 3D axes may not be supported by every
# Matplotlib release — confirm against the installed version.
ax.axis('scaled')
ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
plt.show()

11.平均分割文件夹

import shutil
import os
import os.path
# Number of output folders the files are distributed over
numfile=10
# Source directory whose files are distributed
input_file='bill1'
def mkdir(path):
    """Ensure directory ``path`` exists, creating it (with parents) if needed.

    Leading/trailing whitespace and trailing backslashes are removed first.

    Returns:
        True when the directory had to be created (a message is printed),
        False when the path was already there.
    """
    target = path.strip()
    target = target.rstrip("\\")
    created = not os.path.exists(target)
    if created:
        os.makedirs(target)
        print('%s创建成功' % target)
    return created
def get_all_files(bg_path):
    """Return the paths of all files under ``bg_path``, recursing into subfolders.

    Entries that are not regular files are treated as directories.  The
    result is not sorted: it follows the order in which ``os.listdir``
    yields the entries.
    """
    gathered = []
    for item in os.listdir(bg_path):
        location = os.path.join(bg_path, item)
        if not os.path.isfile(location):
            gathered += get_all_files(location)
        else:
            gathered.append(location)
    return gathered
# Collect every file under the source directory (the same call is repeated
# further down, right before the split).
files=get_all_files(input_file)
def moveFileto(sourceDir,  targetDir): 
    """Copy ``sourceDir`` to ``targetDir`` using ``shutil.copy``.

    Despite the name, the source is copied, not moved: it stays in place.
    """
    shutil.copy(sourceDir,  targetDir)

# 1. Read every file
# 2. Work out how many files go into each folder
# 3. Create the folders in a loop
# 4. Fill the first n-1 folders
# 5. Put the remaining files into the last folder
files = get_all_files(input_file)
file_num = int(len(files) / numfile)
flag_files = 0  # running count of files handled so far
for i in range(numfile - 1):
    filename = "bill_" + str(i)
    mkdir(filename)
    # Next file_num files, starting where the previous folder stopped.
    for source in files[flag_files:flag_files + file_num]:
        filepath, tmpfilename = os.path.split(source)
        tragetDir = os.path.join(filename, tmpfilename)
        moveFileto(source, tragetDir)
        flag_files += 1
        print(flag_files)
# The last folder receives whatever is left over
filename = "bill_" + str(numfile - 1)
mkdir(filename)
print("最后一个", flag_files)
for source in files[flag_files:]:
    filepath, tmpfilename = os.path.split(source)
    tragetDir = os.path.join(filename, tmpfilename)
    moveFileto(source, tragetDir)
    flag_files += 1
    print(flag_files)

12.缩放文件

import os
import cv2
def get_all_files(bg_path):
    """List every file below ``bg_path`` recursively, ordered by its number.

    Non-file entries are treated as directories and recursed into.  Paths
    are sorted ascending by ``int(path[-7:-4])`` — the three characters
    before the last four characters of the path — which must be digits.
    """
    def number_of(path):
        return int(path[-7:-4])

    results = []
    for entry in os.listdir(bg_path):
        entry_path = os.path.join(bg_path, entry)
        if os.path.isfile(entry_path):
            results.append(entry_path)
            continue
        results.extend(get_all_files(entry_path))
    return sorted(results, key=number_of)
# Resize every image under ./suoluetu/train to 480x360 and write it to ./train.
images=get_all_files("./suoluetu/train")
outpath="./train"
# cv2.imwrite does not create missing directories.
if not os.path.isdir(outpath):
    os.makedirs(outpath)
for i in images:
    img=cv2.imread(i)
    if img is None:
        # imread signals an unreadable or non-image file by returning None.
        raise IOError("cannot read image: %s" % i)
    img_test1 = cv2.resize(img, (480, 360))
    _, tmpfilename = os.path.split(i)
    name=os.path.join(outpath,tmpfilename)
    #print(name)
    if not cv2.imwrite(name,img_test1):
        # imwrite reports failure through its return value.
        raise IOError("cannot write image: %s" % name)

13.文件批量改名字

import os
# Rename every entry of files_path to 1.pdf, 2.pdf, ... in sorted name order.
files_path="jindong2"
file_type=".pdf"
file_flag=1

# Phase 1: move every entry to a unique staging name.  Renaming straight to
# "<n>.pdf" could overwrite a not-yet-processed file that already has that
# name (e.g. when the folder was numbered before).
staged=[]
for position, i in enumerate(sorted(os.listdir(files_path))):
    src=os.path.join(files_path,i)
    staging=os.path.join(files_path,"__renaming_%d__%s" % (position, i))
    os.rename(src,staging)
    staged.append(staging)

# Phase 2: assign the final numbered names.
for staging in staged:
    file_path=os.path.join(files_path,str(file_flag)+file_type)
    os.rename(staging,file_path)
    file_flag=file_flag+1
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值