Datasets: THUCNews (Chinese), IMDB (English)
Task description: basic text-processing skills; Chinese and English string handling; tokenization; word and character frequency statistics. Language models: unigram, bigram, and trigram frequency counts (sketched at the end of each pipeline below).
English data processing in detail:
The English pipeline mainly relies on the nltk library, plus a few other data-processing libraries.
#import necessary modules
import os
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
#read one sample review from the data
path = "/home/admin-ygb/Desktop/learning/DataWhale_learning_nlp/data/aclImdb/train/"
pos = os.path.join(path,"pos")
neg = os.path.join(path,"neg")
with open(pos+"/0_9.txt","r") as f:
    sample = f.readlines()
sample = sample[0]
def replace_contractions(text):
"""Replace contractions in string of text"""
return contractions.fix(text)
#expand English contractions
sample = replace_contractions(sample)
print(sample)
#English word tokenization; sentence splitting can also be done at this stage (see the sketch below)
words = nltk.word_tokenize(sample)
print(words)
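Since sentence splitting is mentioned above and sent_tokenize is already imported, here is a minimal sketch run on the same sample string (it assumes the nltk 'punkt' tokenizer data has been downloaded):
#split the raw review into sentences
sentences = sent_tokenize(sample)
print(sentences)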
#helper functions for normalizing the token list
def remove_non_ascii(words):
"""Remove non-ASCII characters from list of tokenized words"""
new_words = []
for word in words:
new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
new_words.append(new_word)
return new_words
def to_lowercase(words):
"""Convert all characters to lowercase from list of tokenized words"""
new_words = []
for word in words:
new_word = word.lower()
new_words.append(new_word)
return new_words
def remove_punctuation(words):
"""Remove punctuation from list of tokenized words"""
new_words = []
for word in words:
new_word = re.sub(r'[^\w\s]', '', word)
if new_word != '':
new_words.append(new_word)
return new_words
def replace_numbers(words):
"""Replace all interger occurrences in list of tokenized words with textual representation"""
p = inflect.engine()
new_words = []
for word in words:
if word.isdigit():
new_word = p.number_to_words(word)
new_words.append(new_word)
else:
new_words.append(word)
return new_words
def remove_stopwords(words):
"""Remove stop words from list of tokenized words"""
new_words = []
for word in words:
if word not in stopwords.words('english'):
new_words.append(word)
return new_words
def stem_words(words):
"""Stem words in list of tokenized words"""
stemmer = LancasterStemmer()
stems = []
for word in words:
stem = stemmer.stem(word)
stems.append(stem)
return stems
def lemmatize_verbs(words):
"""Lemmatize verbs in list of tokenized words"""
lemmatizer = WordNetLemmatizer()
lemmas = []
for word in words:
lemma = lemmatizer.lemmatize(word, pos='v')
lemmas.append(lemma)
return lemmas
def normalize(words):
words = remove_non_ascii(words)
words = to_lowercase(words)
words = remove_punctuation(words)
words = replace_numbers(words)
words = remove_stopwords(words)
return words
#normalize the tokens: strip non-ASCII characters, lowercase, remove punctuation, spell out numbers, drop stopwords
words = normalize(words)
print(words)
def stem_and_lemmatize(words):
stems = stem_words(words)
lemmas = lemmatize_verbs(words)
return stems, lemmas
#get the stems and lemmas
stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)
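The task description also calls for unigram/bigram/trigram frequency statistics, which the walkthrough above does not compute. The following is a minimal sketch of how this could be done on the normalized token list words, using collections.Counter and nltk.util.ngrams:
#count unigram/bigram/trigram frequencies over the normalized tokens
from collections import Counter
from nltk.util import ngrams
unigram_freq = Counter(words)
bigram_freq = Counter(ngrams(words, 2))
trigram_freq = Counter(ngrams(words, 3))
print(unigram_freq.most_common(10))
print(bigram_freq.most_common(10))
print(trigram_freq.most_common(10))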
Chinese data processing in detail:
The Chinese pipeline mainly relies on the jieba library, plus a few other data-processing libraries.
#import necessary modules
import jieba
import pandas as pd
#read in the data
path = "/home/admin-ygb/Desktop/learning/DataWhale_learning_nlp/data/cnews/"
with open(path+"/cnews.train.txt","r") as f:
sample = []
for i in range(10):
sample.append(f.readline().split("\t"))
df = pd.DataFrame(sample,columns=['类别','文本'])
content = df.文本
#remove spaces from the text
def process(our_data):
m1 = map(lambda s: s.replace(' ', ''), our_data)
return list(m1)
#print(process(content))
#keep only Chinese characters and drop everything else
def is_chinese(uchar):
if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
return True
else:
return False
def format_str(content):
content_str = ''
for i in content:
if is_chinese(i):
content_str = content_str + i
return content_str
chinese_list = []
for line in content:
chinese_list.append(format_str(line))
#print(chinese_list)
#tokenize the text with jieba (already imported above)
def fenci(datas):
cut_words = map(lambda s: list(jieba.cut(s)), datas)
return list(cut_words)
fenci_list = fenci(chinese_list)
#print(fenci_list)
#stopword list (a small hand-written example)
stopwords = ['的','呀','这','那','就','的话','如果']
#remove stopwords from the tokenized text
def drop_stopwords(contents, stopwords):
contents_clean = []
for line in contents:
line_clean = []
for word in line:
if word in stopwords:
continue
line_clean.append(word)
contents_clean.append(line_clean)
return contents_clean
#print(drop_stopwords(fenci_list,stopwords))
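For the Chinese side, the word, character, and bigram frequency counts mentioned in the task description could be computed on the cleaned output in the same spirit; a minimal sketch using collections.Counter on the variables defined above:
#count word, character, and bigram frequencies over the cleaned Chinese text
from collections import Counter
contents_clean = drop_stopwords(fenci_list, stopwords)
word_freq = Counter(word for line in contents_clean for word in line)
char_freq = Counter(ch for line in chinese_list for ch in line)
bigram_freq = Counter(tuple(line[i:i+2]) for line in contents_clean for i in range(len(line) - 1))
print(word_freq.most_common(10))
print(char_freq.most_common(10))
print(bigram_freq.most_common(10))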