文本格式化
非英文
去除
英文
大小写、单复数统一(看情况,不是所有的大写都可以变成小写的)
使用TextBlob进行单复数统一
针对引号进行分词,就是说以"xxx"为单位进行处理
大小写
说实话,大小写不能乱改:全部变成小写可能会有问题,例如ToYou变成toyou可能会失去本来的意思
单复数
TextBlob能不能查询一个词是不是名词的复数?如果可以,就把它变成单数;如果对应的单数已经存在,就把这个复数删掉
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
from textblob import TextBlob
# Load the raw hashtag dump and flatten it into one string so the rest of the
# script can scan it by character index.
filename = r"C:/Users/Administrator/Desktop/inx-hashtag.json"
# Read inside a context manager so the input handle is closed right away.
with open(filename, "rb") as file:
    content = file.readlines()
# NOTE: the file is read in binary mode, so on Python 3 '%s' renders each line
# as its bytes repr (b'...'); the rendered lines are joined with single spaces.
content = " ".join('%s' % line for line in content)
contentfinal = ""
length = len(content)
# Index of the most recent "[" / "]" seen while scanning.
start = 0
end = 0
strf = ""
strd = ""
testb = False
def is_number(uchar):
    """Return True when *uchar* is an ASCII digit ('0' through '9').

    The check is a plain string range comparison, so a longer string is
    compared lexicographically against the two bounds.
    """
    return "0" <= uchar <= "9"
def is_alphabet(uchar):
    """Return True when *uchar* is an ASCII letter (A-Z or a-z).

    The check is a plain string range comparison, so a longer string is
    compared lexicographically against the bounds of each range.
    """
    is_upper = "A" <= uchar <= "Z"
    is_lower = "a" <= uchar <= "z"
    return is_upper or is_lower
# Rebuild every `"key": [tag, ...]` entry found in `content`, keeping only the
# tags accepted by is_alphabet()/is_number(), singularized and de-duplicated.
entries = []
for i in range(0, length - 1):
    if content[i] == "[":
        start = i
        continue
    if content[i] != "]":
        continue
    end = i
    print("Have Done" + str(i / length * 100) + "%")

    # The key is the last double-quoted string in front of the "[".
    key_close = content.rfind("\"", 0, start)
    key_open = content.rfind("\"", 0, key_close) if key_close > 0 else -1
    if key_open < 0:
        # No quoted key precedes this list, so there is nothing to emit.
        continue

    tags = []
    pos = start
    while pos < end:
        # Opening quotes are only looked for inside the brackets; the closing
        # quote is simply the next quote after the opening one.
        tag_open = content.find("\"", pos, end)
        if tag_open < 0:
            break
        tag_close = content.find("\"", tag_open + 1)
        if tag_close < 0:
            # Unterminated string: stop scanning this list.
            break
        pos = tag_close + 1

        token = content[tag_open + 1:tag_close]
        if not token:
            continue
        if is_alphabet(token) or is_number(token):
            # TextBlob is given the token with its quotes; only the first
            # word it extracts is kept, in singular form.
            quoted = content[tag_open:tag_close + 1]
            singular = TextBlob(quoted).words[0].singularize()
            if singular not in tags:
                tags.append(singular)

    # The key slice stops one character short of "[" (start - 1 is excluded).
    key_text = content[key_open:start - 1]
    entries.append(key_text + " [" + ", ".join('"%s"' % tag for tag in tags) + "]")

contentfinal = "b'{" + ", ".join(entries) + "}'"
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag3.json", 'w') as file:
    file.write(contentfinal)
乱码
根据编码进行区分:引号内以\u开头的(即Unicode转义的)内容就删掉
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
# Empty every double-quoted token that contains a "\u" escape, then write the
# result back over the input file.
filename = r"C:/Users/Administrator/Desktop/inx-hashtag.json"
with open(filename, "rb") as file:
    content = file.readlines()
# NOTE: the file is read in binary mode, so on Python 3 '%s' renders each line
# as its bytes repr (b'...'); the rendered lines are joined with single spaces.
content = " ".join('%s' % line for line in content)

i = 0
while i < len(content):
    # The slice keeps the look-ahead in bounds when "\" is the last character.
    if content[i] == "\\" and content[i + 1:i + 2] == "u":
        print(i)
        opening = content.rfind("\"", 0, i)
        closing = content.find("\"", i + 1)
        if opening >= 0 and closing >= 0:
            # Drop everything between the quotes, leaving an empty "" pair.
            content = content[:opening + 1] + content[closing:]
            # Resume at the closing quote: the text shrank, so keeping the
            # old index would skip characters that were never examined.
            i = opening + 1
    i = i + 1
    print("have done " + str(i / len(content) * 100) + "%")

# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag.json", 'w') as file:
    file.write(content)
词频
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# from textblob import TextBlob
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
import wordninja
# Count how often each double-quoted token appears inside the [...] lists of
# the input and write the counts, most frequent first, to count.txt.
filename = r"C:/Users/Administrator/Desktop/inx-hashtag4.json"
with open(filename, "rb") as file:
    content = file.readlines()
# NOTE: the file is read in binary mode, so on Python 3 '%s' renders each line
# as its bytes repr (b'...'); the rendered lines are joined with single spaces.
content = " ".join('%s' % line for line in content)
length = len(content)

counts = {}
start = 0
for i in range(0, length - 1):
    if content[i] == "[":
        start = i
        continue
    if content[i] != "]":
        continue
    end = i
    print("Have Done" + str(i / length * 100) + "%")

    pos = start
    while pos < end:
        # Opening quotes are only looked for inside the brackets; the closing
        # quote is simply the next quote after the opening one.
        tag_open = content.find("\"", pos, end)
        if tag_open < 0:
            break
        tag_close = content.find("\"", tag_open + 1)
        if tag_close < 0:
            # Unterminated string: stop scanning this list.
            break
        pos = tag_close + 1

        if tag_close - tag_open > 1:
            # Keys keep their surrounding quotes; empty "" tokens are skipped.
            quoted = content[tag_open:tag_close + 1]
            counts[quoted] = counts.get(quoted, 0) + 1

# sorted() is stable, so tokens with equal counts stay in first-seen order.
ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/count.txt", 'w') as file:
    file.write(str(ranked))
可以二次遍历。
n-gram
对于多种单词组成的无空格复合单词,进行分解(添加空格)
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# from textblob import TextBlob
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
import wordninja
# Rebuild every `"key": [tag, ...]` entry of the input, replacing each tag with
# the space-separated words that wordninja splits it into.
filename = r"C:/Users/Administrator/Desktop/inx-hashtag1.json"
with open(filename, "rb") as file:
    content = file.readlines()
# NOTE: the file is read in binary mode, so on Python 3 '%s' renders each line
# as its bytes repr (b'...'); the rendered lines are joined with single spaces.
content = " ".join('%s' % line for line in content)
length = len(content)

entries = []
start = 0
for i in range(0, length - 1):
    if content[i] == "[":
        start = i
        continue
    if content[i] != "]":
        continue
    end = i
    print("Have Done" + str(i / length * 100) + "%")

    # The key is the last double-quoted string in front of the "[".
    key_close = content.rfind("\"", 0, start)
    key_open = content.rfind("\"", 0, key_close) if key_close > 0 else -1
    if key_open < 0:
        # No quoted key precedes this list, so there is nothing to emit.
        continue

    tags = []
    pos = start
    while pos < end:
        # Opening quotes are only looked for inside the brackets; the closing
        # quote is simply the next quote after the opening one.
        tag_open = content.find("\"", pos, end)
        if tag_open < 0:
            break
        tag_close = content.find("\"", tag_open + 1)
        if tag_close < 0:
            # Unterminated string: stop scanning this list.
            break
        pos = tag_close + 1

        if tag_close - tag_open > 1:
            # wordninja is given the token with its quotes; the pieces it
            # returns are re-joined with single spaces.
            quoted = content[tag_open:tag_close + 1]
            tags.append(" ".join(wordninja.split(quoted)))

    # The key slice stops one character short of "[" (start - 1 is excluded).
    key_text = content[key_open:start - 1]
    entries.append(key_text + " [" + ", ".join('"%s"' % tag for tag in tags) + "]")

contentfinal = "b'{" + ", ".join(entries) + "}'"
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag4.json", 'w') as file:
    file.write(contentfinal)