1.bs4 和nltk 简单处理html文档
# Fetch a web page, strip the HTML tags with BeautifulSoup, and count
# word frequencies with NLTK.
from urllib.request import urlopen  # urllib2 was removed in Python 3

from bs4 import BeautifulSoup
import nltk

response = urlopen('http://python.org/')
html = response.read()  # raw HTML content (bytes)

# Strip HTML tags, keeping only the visible text.
soup = BeautifulSoup(html, 'lxml')
clean_text = soup.get_text()

# Word-frequency count. NOTE: the original passed the raw string directly,
# which makes FreqDist count CHARACTERS -- tokenize into words first.
tokens = nltk.word_tokenize(clean_text)
word_freq = nltk.FreqDist(tokens)
# FreqDist is a dict-like object, not a list: use word_freq.keys(),
# word_freq.values(), word_freq.most_common(10), or dir(word_freq).
2.解析json文件
# Parse a JSON file and read one field from the result.
import json

# Context manager guarantees the file handle is closed.
with open('test.json') as j_file:
    data = json.load(j_file)

# json.load returns a plain dict for a JSON object, not an object with
# attributes -- the original `data.name` would raise AttributeError.
print(data['name'])
3.从文本中分离英文句子和词汇并且返回列表
# Split English text into sentences and words, returning lists.
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize

# Renamed from `str`, which shadowed the builtin type.
text = 'hello world. wo you are smart!! ha ha it is cool.'

# Sentence splitting.
sentences = sent_tokenize(text)
print(sentences)

# Word splitting.
text1 = "hi everyone ! hello gr8"
print(word_tokenize(text1))

# Regex-based tokenization -- raw strings so the backslashes reach the
# regex engine intact.
regexp_tokenize(text1, pattern=r'\w+')  # -> ['hi', 'everyone', 'hello', 'gr8']
regexp_tokenize(text1, pattern=r'\d+')  # -> ['8']
4.词干提取
# Stemming: reduce a word to its root form with the Porter algorithm.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stemmer.stem('shopping')  # -> 'shop'
5.停用词
# Remove English stop words from a sentence.
from nltk.corpus import stopwords

stoplist = stopwords.words('english')
# Renamed from `str`, which shadowed the builtin type.
sentence = "this is just a test"
# The original comprehension iterated an undefined `text` and collected the
# wrong variable (`word` instead of `v`), raising NameError at runtime;
# filter the sentence's own words instead.
text = [w for w in sentence.split() if w not in stoplist]
# -> ['test']  ('this', 'is', 'just', 'a' are all in the stop list)
6.词性标注并且对一些属性分类(地点,人物,组织等)
# POS-tag a token list and classify named entities (person, location,
# organization, ...).
import nltk
from nltk import ne_chunk

tokens = ['Mark', 'was', 'watching', 'TV', 'at', 'Stanford', 'University', 'in', 'California']
print(nltk.pos_tag(tokens))
# binary=False labels each entity with its concrete type (PERSON, GPE,
# ORGANIZATION, ...); binary=True only marks spans as NE in the parse tree.
print(ne_chunk(nltk.pos_tag(tokens), binary=False))
7.提取一段文本的实体词语
# Extract named entities from a text file: sentence-split, tokenize,
# drop stop words, POS-tag, then NE-chunk each sentence.
import nltk
from nltk import ne_chunk
from nltk.tokenize import sent_tokenize, word_tokenize
# NOTE: the original also had `from nltk import stopwords`, which raises
# ImportError -- the stopwords corpus lives in nltk.corpus.
from nltk.corpus import stopwords

# Context manager guarantees the file handle is closed.
with open('test.txt') as f:
    text = f.read()

sentences = nltk.sent_tokenize(text)
words = [nltk.word_tokenize(sentence) for sentence in sentences]

# Filter out English stop words, sentence by sentence.
stoplist = stopwords.words('english')
filtered = [[w for w in row if w not in stoplist] for row in words]

# POS-tag the remaining words of each sentence.
tagged = [nltk.pos_tag(row) for row in filtered]

# Named-entity chunking per sentence.
for row in tagged:
    print(nltk.ne_chunk(row))