Useful code snippets from Python Text Processing with NLTK 2.0 Cookbook

If anyone has the latest edition of this book, could you please share it?

para="hello world. it's good to see you. thanks for buying this book"
from nltk.tokenize import sent_tokenize
print(sent_tokenize(para))

print("----------------------------")

from nltk.tokenize import word_tokenize
print(word_tokenize('hello world'))

print("----------------------------")

from nltk.tokenize import word_tokenize
print(word_tokenize('你好,我是 自然 语言 处理'))
# word_tokenize is not a Chinese segmenter: without the spaces it cannot split the words,
# so here it effectively falls back to splitting on whitespace and punctuation
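# Without the spaces, the whole Chinese phrase typically comes back as a single token:
print(word_tokenize('你好,我是自然语言处理'))
# For real Chinese word segmentation a dedicated segmenter such as jieba is the usual choice.
# A minimal sketch, assuming the third-party jieba package is installed:
# import jieba
# print(jieba.lcut('你好,我是自然语言处理'))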

print("----------------------------")
import nltk
text="hello, this is my world"
pattern=r"\w+|[^\w\s]+"
# r: raw string prefix (handy for regular expressions); double quotes "" and single quotes '' are interchangeable;
# \w matches a word character, equivalent to the character class [a-zA-Z0-9_]; + means one or more times, i.e. c+ and c{1,} are equivalent;
# "|" is alternation, the regex "or"; [...] is a character class matching any single character in the set,
# e.g. a[bcd]e matches abe, ace, and ade; inside a class a leading ^ negates it, so [^\w\s] matches anything
# that is neither a word character nor whitespace; \s matches a whitespace character, equivalent to [ \t\n\r\f\v].
print(nltk.tokenize.regexp_tokenize(text,pattern))
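# The same pattern can also be wrapped in a reusable tokenizer object; a small
# sketch using nltk.tokenize.RegexpTokenizer, which behaves the same way:
from nltk.tokenize import RegexpTokenizer
tokenizer=RegexpTokenizer(pattern)
print(tokenizer.tokenize(text))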

print("--------------以上均为切词的手段--------------")

from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import webtext
text=webtext.raw('overheard.txt')
sent_tokenizer=PunktSentenceTokenizer(text)

sents1=sent_tokenizer.tokenize(text)
print(sents1[0])

from nltk.tokenize import sent_tokenize
sents2=sent_tokenize(text)
print(sents2[1])

print("--------------去除停用词--------------")
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words=["cant","is","a","constraction"]
sets=[]
for word in words:
    if word not in english_stops:
        sets.append(word)
print(sets)
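# The same filtering as a one-line list comprehension, plus a quick look at
# which stopword languages ship with the NLTK data:
print([word for word in words if word not in english_stops])
print(stopwords.fileids())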

print("--------------在WORDnet上找同义词--------------")
# way1:
from nltk.corpus import wordnet
syn=wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())

# way2 (NLTK 2.x attribute style): in NLTK 2, name and definition were plain attributes;
# in NLTK 3 they are methods, so without the parentheses these lines just print
# bound-method objects rather than the strings
print(syn.name)
print(syn.definition)


print("----------------------------")

from nltk.corpus import wordnet as wn
motorcar=wn.synset('car.n.01')
types_of_motorcar=motorcar.hyponyms()
print(types_of_motorcar)
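# Going up the hierarchy instead: hypernyms() gives the more general concept,
# and hypernym_paths() the full path(s) up to the root synset entity.n.01
print(motorcar.hypernyms())
print(len(motorcar.hypernym_paths()))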

print("-----------------部分与整体的关系-----------")

print(wn.synset('computer.n.01').part_meronyms())

print("-------------反义词关系---------------")
print(wn.lemma('good.a.01.good').antonyms())

print("---------查看词汇关系和同义词集上定义的其他方法-------------------")
print(dir(wn.synset('beautiful.a.01')))


print("------------pos----------------")
syn=wordnet.synsets('hello')[0]
print(syn.pos())

print("------------查看复数形式和同义词----------------")
print(wn.synset('car.n.01').lemma_names())

print("------------计算同义词的相似度----------------")
# way1: path_similarity  基于上位词层次结构中相互连接的概念之间的最短路径,
# 其值为0-1之间,如果没有路径返回-1
right=wn.synset('right_whale.n.01')
minke=wn.synset('minke_whale.n.01')
print(right.path_similarity(minke))
# way2: wup_similarity (Wu-Palmer) is based on the depths of the two senses and of
# their most specific common ancestor in the hypernym tree
print(right.wup_similarity(minke))
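# Sanity check: an unrelated synset scores much lower under both measures
# (tortoise.n.01 is just an illustrative choice)
tortoise=wn.synset('tortoise.n.01')
print(right.path_similarity(tortoise))
print(right.wup_similarity(tortoise))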


print("------------相对于n-gram----------------")
from nltk import bigrams
a=r"I'm a girl"
tokens=a.split()
# bigrams() returns a generator, so wrap it in list() or nothing useful is printed
print(list(bigrams(tokens)))
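# trigrams() and ngrams() work the same way and also return generators
from nltk import trigrams, ngrams
print(list(trigrams(tokens)))
print(list(ngrams(tokens,2)))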


print("----------------词频统计--------------------")

from nltk import FreqDist
# note: FreqDist over a raw string counts characters, so the spaces are counted too
fdist1=FreqDist("a ni n nn n t t m")
print(fdist1)
print(fdist1.most_common(3))
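# Counting words instead of characters is usually what you want, so tokenize first:
fdist2=FreqDist(word_tokenize("to be or not to be"))
print(fdist2.most_common(2))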

import matplotlib   # only needed for the (commented-out) plot below
# fdist1.plot(3,cumulative=True)


print("----------------词干词语--------------------")
# 单个词干 Poter是一种词干提取的算法
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
print(stemmer.stem('coding'))
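# Other stemmers give different (often more aggressive) results; a quick comparison
# with LancasterStemmer and the language-aware SnowballStemmer:
from nltk.stem import LancasterStemmer, SnowballStemmer
print(LancasterStemmer().stem('coding'))
print(SnowballStemmer('english').stem('coding'))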

# stem multiple words
verbs=['appears', 'appear', 'appeared', 'calling', 'called']
stems=[]
for verb in verbs:
    stemmed_verb=stemmer.stem(verb)
    stems.append(stemmed_verb)
print(sorted((set(stems))))

print("----------------词形还原-------------------")
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
print(lemmatizer.lemmatize('coding'))
print(lemmatizer.lemmatize('codes'))
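# lemmatize() treats words as nouns by default; passing the part of speech changes
# the result, e.g. 'cooking' stays as-is as a noun but reduces to 'cook' as a verb
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking',pos='v'))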


print("----------------利用正则表达式进行词语替换词语--------------------")

import re
replacement_patterns = [
    (r'won\'t', 'will not'),
    (r'can\'t', 'cannot'),
    (r'i\'m', 'i am'),
    (r'ain\'t', 'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would')
]

class RegexReplacer(object):
    def __init__(self,patterns=replacement_patterns):
        self.patterns=[(re.compile(regex),repl) for (regex,repl) in patterns]

    def replace(self,text):
        s=text
        for (pattern,repl) in self.patterns:
            s=re.sub(pattern,repl,s)
        return s

replacer=RegexReplacer()
print(replacer.replace("You're the world, I'm a girl"))
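# The replacer is meant to run before tokenization so the expanded forms are tokenized
# cleanly; note the patterns are lower-case only, which is why "I'm" above is left unchanged:
print(word_tokenize(replacer.replace("You're the world, I'm a girl")))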

print("----------------获取语料-------------------")
# 语料库的文件名,平均字长,平均句长,每个词平均出现的次数
from nltk.corpus import gutenberg
for filename in gutenberg.fileids():
    r=gutenberg.raw(filename)
    w=gutenberg.words(filename)
    s=gutenberg.sents(filename)
    v=set(w)
    print(filename,len(r)/len(w),len(w)/len(s),len(w)/len(v))


# read a local text file (assumes hello.txt exists in the working directory)
with open('hello.txt') as f:
    print(f.read())

print("----------------建立语料库,并进行检索-------------------")
# step1:
corpus_root='E:/JustForNLP/nltkEx'
from nltk.corpus import PlaintextCorpusReader
wordlist=PlaintextCorpusReader(corpus_root,'walden.txt')
print(wordlist.fileids())

wordlists=PlaintextCorpusReader(corpus_root,'.*')
print(wordlists.fileids())
import nltk
# step2:
n=nltk.word_tokenize(wordlists.raw(fileids="walden.txt"))
complete_Walden=nltk.Text(n)
# concordance() prints its matches directly and returns None, so no print() is needed
complete_Walden.concordance("walden")
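# nltk.Text offers other quick exploration helpers, e.g. words appearing in
# similar contexts and simple token counts ("pond" is just an illustrative query word):
complete_Walden.similar("pond")
print(complete_Walden.count("pond"))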

print("----------------获取网络文本-------------------")
from urllib.request import urlopen
url='https://blog.csdn.net/u011001084/article/details/78980299'
html=urlopen(url).read()
print(html[:20])
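# The raw result is HTML; to strip the markup a parser such as BeautifulSoup is
# commonly used. A minimal sketch, assuming the third-party beautifulsoup4 package
# is installed:
# from bs4 import BeautifulSoup
# print(BeautifulSoup(html,'html.parser').get_text()[:200])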


print("----------------tag-------------------")
import nltk
nltk.download('averaged_perceptron_tagger')

text=nltk.word_tokenize("I'm a small girl but the world is big")
print(nltk.pos_tag(text))
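# To see what a tag such as 'NN' or 'JJ' means, NLTK ships documentation for the
# Penn Treebank tagset (requires the 'tagsets' data package):
# nltk.download('tagsets')
# nltk.help.upenn_tagset('NN')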

 

Preface

Natural Language Processing is used everywhere: in search engines, spell checkers, mobile phones, computer games, and even in your washing machine. Python's Natural Language Toolkit (NLTK) suite of libraries has rapidly emerged as one of the most efficient tools for Natural Language Processing. You want to employ nothing less than the best techniques in Natural Language Processing, and this book is your answer.

Python Text Processing with NLTK 2.0 Cookbook is your handy and illustrative guide, which will walk you through all the Natural Language Processing techniques in a step-by-step manner. It will demystify the advanced features of text analysis and text mining using the comprehensive NLTK suite.

This book cuts short the preamble and lets you dive right into the science of text processing with a practical hands-on approach. Get started off with learning tokenization of text. Receive an overview of WordNet and how to use it. Learn the basics as well as advanced features of stemming and lemmatization. Discover various ways to replace words with simpler and more common (read: more searched) variants. Create your own corpora and learn to create custom corpus readers for data stored in MongoDB. Use and manipulate POS taggers. Transform and normalize parsed chunks to produce a canonical form without changing their meaning. Dig into feature extraction and text classification. Learn how to easily handle huge amounts of data without any loss in efficiency or speed.

This book will teach you all that and beyond, in a hands-on learn-by-doing manner. Make yourself an expert in using the NLTK for Natural Language Processing with this handy companion.

What this book covers

Chapter 1, Tokenizing Text and WordNet Basics, covers the basics of tokenizing text and using WordNet.

Chapter 2, Replacing and Correcting Words, discusses various word replacement and correction techniques. The recipes cover the gamut of linguistic compression, spelling correction, and text normalization.

Chapter 3, Creating Custom Corpora, covers how to use corpus readers and create custom corpora. At the same time, it explains how to use the existing corpus data that comes with NLTK.

Chapter 4, Part-of-Speech Tagging, explains the process of converting a sentence, in the form of a list of words, into a list of tuples. It also explains taggers, which are trainable.

Chapter 5, Extracting Chunks, explains the process of extracting short phrases from a part-of-speech tagged sentence. It uses the Penn Treebank corpus for basic training and testing chunk extraction, and the CoNLL 2000 corpus as it has a simpler and more flexible format that supports multiple chunk types.

Chapter 6, Transforming Chunks and Trees, shows you how to do various transforms on both chunks and trees. The functions detailed in these recipes modify data, as opposed to learning from it.

Chapter 7, Text Classification, describes a way to categorize documents or pieces of text and, by examining the word usage in a piece of text, classifiers decide what class label should be assigned to it.

Chapter 8, Distributed Processing and Handling Large Datasets, discusses how to use execnet to do parallel and distributed processing with NLTK. It also explains how to use the Redis data structure server/database to store frequency distributions.

Chapter 9, Parsing Specific Data, covers parsing specific kinds of data, focusing primarily on dates, times, and HTML.
Appendix, Penn Treebank Part-of-Speech Tags, lists a table of all the part-of-speech tags that occur in the treebank corpus distributed with NLTK.
http://www.amazon.com/Python-Text-Processing-NLTK-Cookbook/dp/1782167854/

Paperback: 310 pages
Publisher: Packt Publishing - ebooks Account (August 26, 2014)
Language: English

Over 80 practical recipes on natural language processing techniques using Python's NLTK 3.0

About This Book

Break text down into its component parts for spelling correction, feature extraction, and phrase transformation
Learn how to do custom sentiment analysis and named entity recognition
Work through the natural language processing concepts with simple and easy-to-follow programming recipes

Who This Book Is For

This book is intended for Python programmers interested in learning how to do natural language processing. Maybe you've learned the limits of regular expressions the hard way, or you've realized that human language cannot be deterministically parsed like a computer language. Perhaps you have more text than you know what to do with, and need automated ways to analyze and structure that text. This Cookbook will show you how to train and use statistical language models to process text in ways that are practically impossible with standard programming tools. A basic knowledge of Python and the basic text processing concepts is expected. Some experience with regular expressions will also be helpful.

In Detail

This book will show you the essential techniques of text and language processing. Starting with tokenization, stemming, and the WordNet dictionary, you'll progress to part-of-speech tagging, phrase chunking, and named entity recognition. You'll learn how various text corpora are organized, as well as how to create your own custom corpus. Then, you'll move onto text classification with a focus on sentiment analysis. And because NLP can be computationally expensive on large bodies of text, you'll try a few methods for distributed text processing. Finally, you'll be introduced to a number of other small but complementary Python libraries for text analysis, cleaning, and parsing. This cookbook provides simple, straightforward examples so you can quickly learn text processing with Python and NLTK.