import nltk
import numpy as np
import pandas as pd

# `nlp` (a loaded spaCy language model) and `stopword_list` (a list of stopwords,
# e.g. nltk.corpus.stopwords.words('english')) are used below and are assumed to
# be defined earlier in the script.

def fenci(text):
    # fenci = "word segmentation": split the text into a list of word tokens
    tokens = nltk.word_tokenize(text)
    return tokens
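# Quick sanity check of fenci (assumes NLTK's punkt tokenizer data is downloaded);
# expected output: ['Natural', 'language', 'processing', 'is', 'fun', '.']
print(fenci("Natural language processing is fun."))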
def stem_text(text):
    # Reduce each token to its Porter stem and rejoin the stems into one string.
    ps = nltk.porter.PorterStemmer()
    tokenizer = nltk.tokenize.toktok.ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    stem_tokens = [ps.stem(token.strip()) for token in tokens]
    text = ' '.join(stem_tokens)
    return text
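# Example call (exact stems depend on the NLTK version):
print(stem_text("The connected cats were running"))   # e.g. "the connect cat were run"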
def lemmatize_text(text):
    # Lemmatize with spaCy; keep the surface form for pronouns, which older
    # spaCy versions lemmatize to the placeholder '-PRON-'.
    text = nlp(text)
    lemma_words = [word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text]
    text = ' '.join(lemma_words)
    return text
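# Example call (output depends on the spaCy model; pronouns keep their surface form):
print(lemmatize_text("He was running and eating"))   # e.g. "He be run and eat"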
def remove_stopwords(text, is_lower_case=False):
    # Filter out stopwords; when the text is not already lower-cased, compare
    # the lower-cased token against the stopword list instead.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_tokens = np.array(filtered_tokens)
    return filtered_tokens
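# Example call (assumes stopword_list holds English stopwords):
print(remove_stopwords("The quick brown fox jumps over the lazy dog"))
# e.g. ['quick' 'brown' 'fox' 'jumps' 'lazy' 'dog'] as a numpy array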
def normalize_corpus(corpus, doc_tokenize=True, text_stemming=False,
                     text_lemmatization=False,
                     stopword_removal=True, text_lower_case=True):
    # Run each document through the selected preprocessing steps. Note that
    # stem_text, lemmatize_text and remove_stopwords all expect a string, so
    # doc_tokenize should stay False when it is combined with those steps.
    normalized_corpus = []
    for doc in corpus:
        if doc_tokenize:
            doc = fenci(doc)
        if text_stemming:
            doc = stem_text(doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus
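# Minimal usage sketch with a made-up two-document corpus, using the same options
# as the calls below (stopword removal only); each result is a numpy token array.
sample_corpus = ["This is the first document.", "And this is another one!"]
sample_norm = normalize_corpus(sample_corpus, doc_tokenize=False, text_stemming=False,
                               text_lemmatization=False,
                               stopword_removal=True, text_lower_case=False)
print(sample_norm[0])   # e.g. ['first' 'document' '.']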
# new3 and new5 hold the text data (one document per element)
t5 = normalize_corpus(new5, doc_tokenize=False, text_stemming=False,
                      text_lemmatization=False,
                      stopword_removal=True, text_lower_case=False)
data5 = list(t5)
t3 = normalize_corpus(new3, doc_tokenize=False, text_stemming=False,
                      text_lemmatization=False,
                      stopword_removal=True, text_lower_case=False)
data3 = list(t3)
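# data3 and data5 are now lists of stopword-filtered token arrays; e.g. inspect
# the first processed document of each corpus:
print(len(data5), data5[0][:10])
print(len(data3), data3[0][:10])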