格式化文本

文本格式化

非英文

去除

英文

大小写、单复数统一(看情况,不是所有的大写都可以变成小写的)

使用TextBlob进行单复数统一

针对引号进行分词,就是说以"xxx"为单位进行处理

大小写

说实话,大小写不能乱改:全部变成小写可能会有问题,例如 ToYou 变成 toyou 之后,可能会失去本来的意思。

单复数

TextBlob 能不能判断一个词是不是名词的复数?如果是,就把它变成单数;如果变成单数之后与已有的词重复,就删掉。


#!usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
from textblob import TextBlob
# Input file for the normalisation pass.
filename=r"C:/Users/Administrator/Desktop/inx-hashtag.json"
# Read every line as bytes; the context manager closes the handle as soon as
# the lines are in memory.
with open(filename,"rb") as file:
    content=file.readlines()
# Each element is a bytes object, so '%s' renders it through str() (under
# Python 3 that is the b'...' repr).  The pieces are glued together with a
# single space and scanned as one long string from here on.
content=" ".join('%s' %line for line in content)
contentfinal=""        # output text, assembled after the scan
length=len(content)    # number of characters in the joined text
start=0                # index of the most recent "["
end=0                  # index of the most recent "]"
strf=""
strd=""
testb=False
def is_number(uchar):
    """Return True when ``uchar`` is an ASCII digit ('0' through '9').

    The check is a plain string-ordering comparison against the two
    endpoints, so a string longer than one character is compared
    lexicographically as a whole rather than character by character.
    """
    return u'\u0030' <= uchar <= u'\u0039'



def is_alphabet(uchar):
    """Return True when ``uchar`` is an ASCII letter (A-Z or a-z).

    Both ranges are tested with string-ordering comparisons, so a string
    longer than one character is compared lexicographically as a whole
    rather than character by character.
    """
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower

# Rebuild the tag file entry by entry.  The text is scanned one character at a
# time: every "[" ... "]" pair is treated as one tag list and the quoted string
# just before the "[" as its key.  Inside each list, tags whose first character
# is an ASCII letter or digit are kept, reduced to the singular form of their
# first word with TextBlob, and de-duplicated.
# Note: the final character of the text is never inspected (range stops at
# length-1).
entries=[]
for i in range(0,length-1):
    if content[i]=="[":
        start=i
        continue
    if content[i]!="]":
        continue

    end=i
    Done=i/length*100
    tempstorelist=[]
    print("Have Done"+str(Done)+"%")

    # Walk backwards from the character before "[" to locate the key: the
    # first quote met is the key's closing quote, the next one its opening
    # quote.  content[ids:ide] is then the key text up to (not including) the
    # character right before "[".
    ids=start-1
    ide=start-1
    sright=False
    while True:
        if content[ids]=="\"" and not sright:
            sright=True
            ids=ids-1
        if content[ids]=="\"" and sright:
            break
        ids=ids-1

    # Visit each quoted tag between "[" and "]".  After a tag is handled the
    # scan resumes just past its closing quote, so closing quotes are never
    # mistaken for the start of another tag.
    ite=start
    while ite<end:
        if content[ite]!="\"":
            ite=ite+1
            continue
        idds=ite                      # opening quote
        idde=idds+1
        while content[idde]!="\"":    # advance to the closing quote
            idde=idde+1
        ite=idde+1
        if idde-idds<=1:
            continue                  # empty tag: ""
        # The helpers are single-character predicates, so only the first
        # character of the tag is tested.  Handing them the whole tag would
        # compare it lexicographically and wrongly reject tags that start with
        # "z", "Z" or "9".
        # NOTE(review): a tag that starts with an ASCII character but contains
        # other characters later still passes this filter.
        firstchar=content[idds+1]
        if is_alphabet(firstchar) or is_number(firstchar):
            storestr=content[idds:idde+1]
            blobstr=TextBlob(storestr)
            # Only the first word of the tag is kept, in singular form.
            repeatnot=blobstr.words[0].singularize()
            if repeatnot not in tempstorelist:
                tempstorelist.append(repeatnot)

    taglist=", ".join('\"%s\"' %word for word in tempstorelist)
    entries.append(content[ids:ide]+" ["+taglist+"]")

# The b'{ ... }' wrapper is written literally around the entries.
# NOTE(review): presumably this imitates the bytes repr that the input text
# carries after being read in binary mode -- confirm before relying on it.
contentfinal="b'{"+", ".join(entries)+"}'"
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag3.json",'w') as file:
    file.write(contentfinal)

乱码

根据编码进行区分:引号内出现 \u 转义序列(即非 ASCII 字符)的条目,就把该引号内的内容删掉。

#!usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
   
# Remove every quoted item that contains a "\u" escape sequence and write the
# result back over the input file.  Only the text between the quotes is
# removed, so an emptied item is left behind as "".
filename=r"C:/Users/Administrator/Desktop/inx-hashtag.json"
with open(filename,"rb") as file:
    content=file.readlines()
# Each element is a bytes object, so '%s' renders it through str() (under
# Python 3 that is the b'...' repr); the pieces are joined with single spaces.
content=" ".join('%s' %line for line in content)

i=0
while i<len(content):
    # The bounds check keeps a backslash in the very last position from
    # indexing past the end of the text.
    if content[i]=="\\" and i+1<len(content) and content[i+1]=="u":
        print(i)
        openquote=content.rfind("\"",0,i)     # nearest quote before the escape
        idde=content.find("\"",i+1)           # nearest quote after the escape
        # Only cut when the escape really sits between two quotes; otherwise
        # leave the text untouched and keep scanning.
        if openquote!=-1 and idde!=-1:
            idds=openquote+1
            content=content[:idds]+content[idde:]
            # The closing quote now sits at idds.  Resume right after it so
            # that none of the following text is skipped without being
            # scanned.
            i=idds
    i=i+1
    print("have done "+str(i/len(content)*100)+"%")

# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag.json",'w') as file:
    file.write(content)

词频

#!usr/bin/env python
#-*- coding:utf-8 -*-
# from textblob import TextBlob
#!usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
import wordninja
   
# Count how often each quoted tag occurs inside the "[" ... "]" lists of the
# input and write the (tag, count) pairs, most frequent first, to count.txt.
filename=r"C:/Users/Administrator/Desktop/inx-hashtag4.json"
with open(filename,"rb") as file:
    content=file.readlines()
# Each element is a bytes object, so '%s' renders it through str() (under
# Python 3 that is the b'...' repr); the pieces are joined with single spaces.
content=" ".join('%s' %line for line in content)
length=len(content)
start=0    # index of the most recent "["
end=0      # index of the most recent "]"
# Tag text (quotes included) -> number of occurrences.  Named so that the
# builtin ``dict`` is not shadowed.
tagcounts={}
# Note: the final character of the text is never inspected (range stops at
# length-1).
for i in range(0,length-1):
    if content[i]=="[":
        start=i
        continue
    if content[i]!="]":
        continue

    end=i
    Done=i/length*100
    print("Have Done"+str(Done)+"%")

    # Visit each quoted tag between "[" and "]".  After a tag is handled the
    # scan resumes just past its closing quote, so closing quotes are never
    # mistaken for the start of another tag.
    ite=start
    while ite<end:
        if content[ite]!="\"":
            ite=ite+1
            continue
        idds=ite                      # opening quote
        idde=idds+1
        while content[idde]!="\"":    # advance to the closing quote
            idde=idde+1
        ite=idde+1
        if idde-idds<=1:
            continue                  # empty tag: ""
        storestr=content[idds:idde+1]
        tagcounts[storestr]=tagcounts.get(storestr,0)+1

# sorted() is stable, so tags with equal counts keep first-seen order.
ranked=sorted(tagcounts.items(),key=lambda item:item[1],reverse=True)
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/count.txt",'w') as file:
    file.write(str(ranked))

可以二次遍历。

n-gram

对于由多个单词组成的无空格复合单词,进行分解(添加空格)

#!usr/bin/env python
#-*- coding:utf-8 -*-
# from textblob import TextBlob
#!usr/bin/env python
#-*- coding:utf-8 -*-
import math
import os
import glob
import numpy as np
import jieba
import string
import jieba.analyse
import wordninja
   
# Split every quoted tag into its component words with wordninja and write the
# rebuilt "key": [tags] entries to a new file.
filename=r"C:/Users/Administrator/Desktop/inx-hashtag1.json"
with open(filename,"rb") as file:
    content=file.readlines()
# Each element is a bytes object, so '%s' renders it through str() (under
# Python 3 that is the b'...' repr); the pieces are joined with single spaces.
content=" ".join('%s' %line for line in content)
length=len(content)
start=0    # index of the most recent "["
end=0      # index of the most recent "]"
entries=[]
# Note: the final character of the text is never inspected (range stops at
# length-1).
for i in range(0,length-1):
    if content[i]=="[":
        start=i
        continue
    if content[i]!="]":
        continue

    end=i
    Done=i/length*100
    tempstorelist=[]
    print("Have Done"+str(Done)+"%")

    # Walk backwards from the character before "[" to locate the key: the
    # first quote met is the key's closing quote, the next one its opening
    # quote.  content[ids:ide] is then the key text up to (not including) the
    # character right before "[".
    ids=start-1
    ide=start-1
    sright=False
    while True:
        if content[ids]=="\"" and not sright:
            sright=True
            ids=ids-1
        if content[ids]=="\"" and sright:
            break
        ids=ids-1

    # Visit each quoted tag between "[" and "]".  After a tag is handled the
    # scan resumes just past its closing quote, so closing quotes are never
    # mistaken for the start of another tag.
    ite=start
    while ite<end:
        if content[ite]!="\"":
            ite=ite+1
            continue
        idds=ite                      # opening quote
        idde=idds+1
        while content[idde]!="\"":    # advance to the closing quote
            idde=idde+1
        ite=idde+1
        if idde-idds<=1:
            continue                  # empty tag: ""
        # The tag is passed to wordninja with its surrounding quotes; the
        # words it returns are joined back with single spaces.
        storestr=content[idds:idde+1]
        blobstr=wordninja.split(storestr)
        tempstorelist.append(" ".join('%s' %word for word in blobstr))

    taglist=", ".join('\"%s\"' %tag for tag in tempstorelist)
    entries.append(content[ids:ide]+" ["+taglist+"]")

# The b'{ ... }' wrapper is written literally around the entries.
# NOTE(review): presumably this imitates the bytes repr that the input text
# carries after being read in binary mode -- confirm before relying on it.
contentfinal="b'{"+", ".join(entries)+"}'"
# The context manager guarantees the output is flushed and closed.
with open(r"C:/Users/Administrator/Desktop/inx-hashtag4.json",'w') as file:
    file.write(contentfinal)

转载于:https://www.cnblogs.com/harrysong666/p/10448713.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值