词干提取
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb
# Three alternative ways to construct a stemmer. Each assignment rebinds the
# same name, so only the last one (Snowball) is used by the stem() call below.
# Porter stemmer (relatively lenient)
stemmer = pt.PorterStemmer()
# Lancaster stemmer (relatively strict)
stemmer = lc.LancasterStemmer()
# Snowball stemmer (in between the two)
stemmer = sb.SnowballStemmer('english')
r = stemmer.stem('playing') # extract the stem of a single word
词性还原
与词干提取作用类似, 但词干提取出的词干信息不利于人工二次处理(人读不懂); 词性还原(即词形还原, lemmatization)可以把名词复数等形式恢复为单数形式, 更有利于人工二次处理.
import nltk.stem as ns
# Lemmatizer (WordNet-based)
lemmatizer = ns.WordNetLemmatizer()
# NOTE: `word` is not defined in this fragment; the surrounding code must
# supply the string to lemmatize.
# Lemma of `word` when treated as a noun (pos='n').
n_lemm=lemmatizer.lemmatize(word, pos='n')
# Lemma of `word` when treated as a verb (pos='v').
v_lemm=lemmatizer.lemmatize(word, pos='v')
案例:词干提取
"""
Stemmer comparison demo.

Runs the Porter, Lancaster and Snowball (English) stemmers over the same
word list and prints one row per word: the word followed by its three stems.
"""
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

# Sample words; the spelling of every entry is kept exactly as in the
# original sample data so the printed rows stay the same.
words = ['table', 'probably', 'wolves',
         'playing', 'is', 'the', 'beaches',
         'grouded', 'dreamt', 'envision']

# One instance per algorithm, created once and reused for every word.
pt_stemmer = pt.PorterStemmer()
lc_stemmer = lc.LancasterStemmer()
sb_stemmer = sb.SnowballStemmer('english')

for word in words:
    # The loop body has to be indented under the `for`; without the
    # indentation the script fails with IndentationError.
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    # Columns: word, Porter stem, Lancaster stem, Snowball stem, each
    # right-aligned in a field of width 8.
    print('%8s %8s %8s %8s' % (word, pt_stem, lc_stem, sb_stem))
提取的结果:
table tabl tabl tabl
probably probabl prob probabl
wolves wolv wolv wolv
playing play play play
is is is is
the the the the
beaches beach beach beach
grouded groud groud groud
dreamt dreamt dreamt dreamt
envision envis envid envis
案例:词性还原
"""
Lemmatization demo.

Lemmatizes every sample word twice with WordNetLemmatizer, once as a noun
(pos='n') and once as a verb (pos='v'), and prints one row per word:
the word, its noun lemma and its verb lemma.
"""
import nltk.stem as ns
import nltk

# Fetch the WordNet corpus only when it cannot be found locally, so repeated
# runs do not contact the download server each time. If the lookup fails,
# the original unconditional download call is used as the fallback.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Sample words; the spelling of every entry is kept exactly as in the
# original sample data so the printed rows stay the same.
words = ['table', 'probably', 'wolves',
         'playing', 'is', 'the', 'beaches',
         'grouded', 'dreamt', 'envision']

lemmatizer = ns.WordNetLemmatizer()

for word in words:
    # The loop body has to be indented under the `for`; without the
    # indentation the script fails with IndentationError.
    n_lemm = lemmatizer.lemmatize(word, pos='n')
    v_lemm = lemmatizer.lemmatize(word, pos='v')
    # Columns: word, noun lemma, verb lemma, each right-aligned in a
    # field of width 8.
    print('%8s %8s %8s' % (word, n_lemm, v_lemm))
如下是词性还原的结果:
table table table
probably probably probably
wolves wolf wolves
playing playing play
is is be
the the the
beaches beach beach
grouded grouded grouded
dreamt dreamt dream
envision envision envision