#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Remove duplicate lines from a text file.

Reads the file given with ``-f/--file``, drops every line that has already
appeared earlier in the file (the first occurrence is kept, order is
preserved), writes the result to ``output.txt`` in the current working
directory and prints a short summary of how many lines were removed.
"""
import sys
from optparse import OptionParser


def readfile(filename):
    """Return the lines of *filename* without their trailing newline.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line, newline terminators removed.

    Raises:
        SystemExit: If the file cannot be opened. "No such file" is printed
            first and the exit status is 1 so that callers and shells can
            detect the failure.
    """
    try:
        f = open(filename)
    # IOError is listed for Python 2 (it is an alias of OSError on Python 3);
    # TypeError covers filename=None, which was reported the same way before.
    except (IOError, OSError, TypeError):
        print("No such file")
        sys.exit(1)
    with f:
        # Strip the terminator from *every* line, including the last one, so
        # a final "word\n" compares equal to an earlier "word".
        return [line.rstrip("\n") for line in f]


def unique(arr):
    """Return the items of *arr* with duplicates removed, order preserved.

    Args:
        arr: Sequence of hashable items.

    Returns:
        A new list holding the first occurrence of each distinct item, in
        the order in which those first occurrences appear in *arr*.
    """
    seen = set()
    result = []
    for item in arr:
        # Single pass with a set lookup instead of sorting by arr.index,
        # which is quadratic in the number of lines.
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result


def main():
    """Parse the command line, deduplicate the input file, print a summary."""
    parser = OptionParser()
    # Path of the input file, e.g. run with: -f path/to/terms.txt
    parser.add_option("-f", "--file", dest="filename",
                      help="read the lines to deduplicate from FILE",
                      metavar="FILE")
    (options, args) = parser.parse_args()
    filename = options.filename
    if filename is None:
        # Prints the usage text and exits with status 2.
        parser.error("an input file is required (-f FILE)")

    text = readfile(filename)
    text_dealed = unique(text)

    # Every output line is newline-terminated, so the result is a regular
    # text file regardless of how the input file ended.
    with open("output.txt", "w") as f:
        f.writelines(line + "\n" for line in text_dealed)

    deduplication_num = len(text) - len(text_dealed)
    print("success")
    print("The num of data from the source file :" + str(len(text)))
    print("The num of data from the preprocessed file: :" + str(len(text_dealed)))
    print("The num of data removed :" + str(deduplication_num))


if __name__ == '__main__':
    main()
删除文件中重复的词语
最新推荐文章于 2021-03-26 23:42:14 发布