# 首先,去除标点符号,并用空格代替(同时统计各标点符号出现的次数)。
# 接着,用字典统计词频:单词第一次出现时将其加入字典,之后每出现一次,就把该键对应的值加 1。
# 最后,还可以统计给定关键字的出现次数。
def amount0(data, out=None):
    """Blank out non-alphanumeric characters and tally the punctuation found.

    Characters in the separator set (space, newline, ideographic space,
    hyphen) are replaced by a space without being counted. Every other
    non-alphanumeric character is counted and then replaced by a space.
    The tally is printed to stdout as a dict and written to ``out`` as one
    ``"<char> <count>"`` line per character, in order of first appearance.

    Args:
        data: Text to clean.
        out: Writable text stream that receives the tally lines. When
            omitted, the module-level ``datafile`` is used, so existing
            one-argument callers keep working.

    Returns:
        A string of the same length as ``data`` in which every
        non-alphanumeric character has been replaced by a single space.
    """
    separators = {' ', '\n', '\u3000', '-'}
    counts = {}
    cleaned = []
    for ch in data:
        if ch.isalnum():
            cleaned.append(ch)
            continue
        if ch not in separators:
            # The first sighting counts as 1 (not 0), so the tally equals
            # the real number of occurrences.
            counts[ch] = counts.get(ch, 0) + 1
        cleaned.append(' ')
    print(counts)
    for ch, count in counts.items():
        # Resolve the fallback lazily: the global is only needed when there
        # is actually something to write.
        print("%s %s" % (ch, count), file=datafile if out is None else out)
    return "".join(cleaned)
def amount1(data, out=None):
    """Count whitespace-separated words and report their frequencies.

    Each distinct word is written to ``out`` as a ``"<word> <count>"`` line,
    then the whole tally is printed to stdout as a dict. Words are reported
    in order of first appearance, which keeps the output reproducible
    between runs.

    Args:
        data: Text whose words are separated by whitespace (punctuation is
            not stripped here).
        out: Writable text stream that receives the frequency lines. When
            omitted, the module-level ``datafile`` is used, so existing
            one-argument callers keep working.
    """
    counts = {}
    for word in data.split():
        counts[word] = counts.get(word, 0) + 1
    for word, count in counts.items():
        # Resolve the fallback lazily: the global is only needed when there
        # is actually something to write.
        print("%s %s" % (word, count), file=datafile if out is None else out)
    print(counts)
def amount2(data, keywords, out=None):
    """Count how often each keyword occurs as a whole word in ``data``.

    Commas, semicolons and full stops are turned into spaces before the
    text is split on whitespace. Any other punctuation stays attached to
    its word, so ``"to!"`` does not match the keyword ``"to"``. The result
    is printed to stdout as a dict and written to ``out`` as one
    ``"<keyword> <count>"`` line per keyword, in the order given.

    Args:
        data: Text to search.
        keywords: Iterable of words to count; matching is exact and
            case-sensitive.
        out: Writable text stream that receives the count lines. When
            omitted, the module-level ``datafile`` is used, so existing
            two-argument callers keep working.
    """
    # One translation pass replaces the three separator characters with spaces.
    words = data.translate(str.maketrans(",;.", "   ")).split()
    answer = dict.fromkeys(keywords, 0)
    # A single pass over the text avoids rescanning the word list per keyword.
    for word in words:
        if word in answer:
            answer[word] += 1
    print(answer)
    for keyword, count in answer.items():
        # Resolve the fallback lazily: the global is only needed when there
        # is actually something to write.
        print("%s %s" % (keyword, count), file=datafile if out is None else out)
# Read the GBK-encoded input once, lower-cased so that counting ignores case.
with open("data.txt", "r", encoding='GBK') as f:
    data = f.read().lower()

# `datafile` has to remain a module-level name: the amount* functions write
# their results to it. The `with` block guarantees it is flushed and closed
# even if one of the steps raises.
with open("output.txt", "w") as datafile:
    # Step 1: tally punctuation and get the text with punctuation blanked out.
    data1 = amount0(data)
    # Step 2: word frequencies over the cleaned text.
    amount1(data1)
    # Step 3: counts for a fixed keyword list, taken from the raw text.
    keywords = ["if", "sun", "the", "to", "is"]
    amount2(data, keywords)