NLP Task 2: Feature Extraction


Datasets: THUCNews (Chinese) and the IMDB dataset (English).

Task description: basic text-processing skills, including Chinese and English string handling, tokenization, and word/character frequency statistics. Language models: unigram, bigram, and trigram frequency statistics.

English data processing in detail:

For English, we mainly use the nltk library, along with a few other data-processing libraries.

#import necessary modules
import os
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

#read the data to sample
path = "/home/admin-ygb/Desktop/learning/DataWhale_learning_nlp/data/aclImdb/train/"
pos = os.path.join(path,"pos")
neg = os.path.join(path,"neg")

with open(pos+"/0_9.txt","r") as f:
    sample = f.readlines()
    sample = sample[0]

def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)
#expand English contractions, e.g. don't -> do not
sample = replace_contractions(sample)
print(sample)

#English word tokenization; sentence splitting can also be done at this stage
words = nltk.word_tokenize(sample)
print(words)
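
The comment above notes that sentence splitting can also be done at this stage; here is a minimal sketch using sent_tokenize (already imported from nltk above), which the rest of the pipeline does not depend on:

#optional: split the review into sentences before word-level processing
sentences = nltk.sent_tokenize(sample)
print(len(sentences), sentences[:2])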

#helper functions for normalizing the tokenized words
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    words = replace_numbers(words)
    words = remove_stopwords(words)
    return words
#normalize the tokens: remove non-ASCII characters and punctuation, lowercase, spell out numbers, and drop stopwords
words = normalize(words)
print(words)

def stem_and_lemmatize(words):
    stems = stem_words(words)
    lemmas = lemmatize_verbs(words)
    return stems, lemmas

#get stems and lemmas
stems, lemmas = stem_and_lemmatize(words)
print('Stemmed:\n', stems)
print('\nLemmatized:\n', lemmas)
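
The task also asks for unigram, bigram, and trigram frequency statistics. Here is a minimal sketch over the normalized token list, using collections.Counter and nltk.util.ngrams (the variable names are my own):

#count unigram, bigram and trigram frequencies over the normalized tokens
from collections import Counter
from nltk.util import ngrams

unigram_freq = Counter(words)
bigram_freq = Counter(ngrams(words, 2))
trigram_freq = Counter(ngrams(words, 3))
print(unigram_freq.most_common(10))
print(bigram_freq.most_common(10))
print(trigram_freq.most_common(10))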

Chinese data processing in detail:

For Chinese, we mainly use the jieba library, along with a few other data-processing libraries.

#import necessary modules
import jieba
import pandas as pd

#read the data
path = "/home/admin-ygb/Desktop/learning/DataWhale_learning_nlp/data/cnews/"
with open(path+"/cnews.train.txt","r") as f:
    sample = []
    for i in range(10):
        sample.append(f.readline().strip("\n").split("\t"))
df = pd.DataFrame(sample,columns=['类别','文本'])
content = df.文本

#remove spaces from the text
def process(our_data):
    m1 = map(lambda s: s.replace(' ', ''), our_data)
    return list(m1)
#print(process(content))

#keep only Chinese characters and drop everything else
def is_chinese(uchar):
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False

def format_str(content):
    content_str = ''
    for i in content:
        if is_chinese(i):
            content_str = content_str + i
    return content_str

chinese_list = []
for line in content:
    chinese_list.append(format_str(line))
#print(chinese_list)

#tokenize the text with jieba (already imported above)
def fenci(datas):
    cut_words = map(lambda s: list(jieba.cut(s)), datas)
    return list(cut_words)

fenci_list = fenci(chinese_list)
#print(fenci_list)

#stopword list (a small example list)
stopwords = ['的','呀','这','那','就','的话','如果']
#remove stopwords from the text
def drop_stopwords(contents, stopwords):
    contents_clean = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopwords:
                continue
            line_clean.append(word)
        contents_clean.append(line_clean)
    return contents_clean

#print(drop_stopwords(fenci_list,stopwords))
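
For the word and character frequency statistics the task asks for, here is a minimal sketch over the cleaned data (collections.Counter is from the standard library; contents_clean and the other names are my own):

#word and character frequency statistics for the Chinese sample
from collections import Counter

contents_clean = drop_stopwords(fenci_list, stopwords)
word_freq = Counter(word for line in contents_clean for word in line)
char_freq = Counter(char for line in chinese_list for char in line)
print(word_freq.most_common(10))
print(char_freq.most_common(10))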

To summarize: this task covered basic processing of Chinese and English text data. Building on the cleaned data, the next step is simple feature representation, e.g. frequency-based features such as tf-idf, or embedding-based methods such as word2vec.
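
As a pointer for that follow-up step, here is a minimal tf-idf sketch using scikit-learn's TfidfVectorizer (an assumption on my part: scikit-learn is not used in the original pipeline, and contents_clean comes from the frequency sketch above). The pre-tokenized documents are joined with spaces, and token_pattern is relaxed so single-character Chinese words are kept:

#tf-idf features over the cleaned, tokenized Chinese documents
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [" ".join(line) for line in contents_clean]
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_matrix = vectorizer.fit_transform(docs)
print(tfidf_matrix.shape)
print(len(vectorizer.vocabulary_))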

References

https://github.com/fxsjy/jieba
