<strong><span style="font-size:18px;">开通blog后第一次发文,难免有不足之处,仅作测试之用。这个代码片段用于统计一个目录下所有txt文献(文献由PDF转成txt)中单词(长度大于5)的个数,并计算每个单词重复出现的次数。</span></strong>
<span style="font-size:14px;">#!/usr/bin/python
#-*- coding: utf-8 -*-
import os
import sys
import sqlite3
import time
import codecs
class Txt(object):
    """Count the qualifying words found in the ``.txt`` files of one folder.

    A token qualifies when, after trimming surrounding punctuation and
    removing one common English suffix, it is purely alphabetic and 6 to 10
    characters long.  Words are compared case-insensitively.
    """

    # Folder scanned when ``deal_txt`` is called without an explicit path.
    DEFAULT_PATH = "C:\\Users\\John\\Desktop\\work\\paper"

    # Symbols trimmed from both ends of every token.
    _PUNCTUATION = [' ','!','%','&','*','(',')',');','[',']','{','}','\\','|','/','//','?',':','"','“','”',':',':',';',';',',',',','.','。','`','·','~','-','-','_','+','=','——','-']
    # ``str.strip`` treats its argument as a set of characters, so joining the
    # list trims any mix of these symbols in a single call.
    _STRIP_CHARS = ''.join(_PUNCTUATION)

    # (suffix, replacement) pairs tried in order; only the first match is
    # applied, so longer suffixes must come before the bare 's'.
    _SUFFIX_RULES = (
        ('ing', ''),
        ('ied', 'y'),
        ('ies', 'y'),
        ('tions', 'tion'),
        ('ed', ''),
        ("'s", ''),
        ('s', ''),
    )

    def __init__(self):
        """Start with an empty word-frequency table."""
        # Lower-cased word -> number of occurrences, filled by ``deal_txt``.
        self.word_counts = {}

    def _clean_word(self, token):
        """Return the normalised form of *token*, or ``None`` to reject it.

        The original letter case is kept; callers lower-case the result.
        """
        word = token.strip(self._STRIP_CHARS)
        if len(word) <= 4:
            return None
        for suffix, replacement in self._SUFFIX_RULES:
            if word.endswith(suffix):
                # Slice instead of str.replace so that only the trailing
                # suffix changes ("systems" -> "system", not "ytem").
                word = word[:-len(suffix)] + replacement
                break
        # isalpha() also rejects URLs, digits and tokens with inner
        # punctuation such as a typographic apostrophe.
        if len(word) <= 5 or len(word) > 10 or not word.isalpha():
            return None
        return word

    def deal_txt(self, path=None):
        """Scan every ``*.txt`` file directly inside *path* and count words.

        Each accepted word is printed as it is found.  After each file a
        summary line shows how many new distinct words that file contributed
        and the running total.  At the end the total and the frequency table,
        sorted by descending count, are printed.  The frequency table is also
        stored in ``self.word_counts``.

        Args:
            path: Folder to scan.  Defaults to ``DEFAULT_PATH``.

        Returns:
            A list of the distinct lower-cased words in first-seen order.
            Files are visited in sorted name order.

        Raises:
            OSError: If *path* cannot be listed or a file cannot be opened.
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        if path is None:
            path = self.DEFAULT_PATH

        unique_words = []
        counts = {}
        for name in sorted(os.listdir(path)):
            file_name = os.path.join(path, name)
            if not (name.endswith('.txt') and os.path.isfile(file_name)):
                continue

            total_before = len(unique_words)
            # "utf-8-sig" removes a leading byte-order mark, so the first
            # word of the file does not have to be thrown away.
            with codecs.open(file_name, encoding="utf-8-sig") as handle:
                text = handle.read()

            # split() without arguments breaks on newlines and tabs as well
            # as spaces, so words at line boundaries are not glued together.
            for token in text.split():
                word = self._clean_word(token)
                if word is None:
                    continue
                try:
                    print(word)
                except UnicodeError:
                    # The console cannot display this word; skip it.
                    continue
                key = word.lower()
                if key in counts:
                    counts[key] += 1
                else:
                    counts[key] = 1
                    unique_words.append(key)

            # Number of distinct words first seen in this file.
            num = len(unique_words) - total_before
            print(file_name,"\n","本篇文章符合要求单词个数是:%d"%num,len(unique_words))

        print(len(unique_words))
        # Frequency table, most frequent first.
        print(sorted(counts.items(), key=lambda item: item[1], reverse=True))
        self.word_counts = counts
        return unique_words
def main():
    """Print a 50-character '#' banner, then run ``Txt.deal_txt`` once."""
    banner = 50 * '#'
    print(banner)
    Txt().deal_txt()
# Run the word count only when this file is executed as a script.
if "__main__" == __name__:
    main()
</span>