第三章 处理原始文本
1 从网络和硬盘访问文本
#<<罪与罚>>的英文翻译 未作测试??
from urllib.request import urlopen
url='http://www.gutenberg.org/files/2554/2554.txt'
raw=urlopen(url).read().decode('utf8')
type(raw)
len(raw)
raw[:75]
#分词 未作测试??
tokens=nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[:10]
#切片
text=nltk.Text(tokens)
type(text)
text[1020:1060]
text.collocations()
#手工挑出文本中的描述信息
raw.find('PART I')#取得字符串索引值
raw.rfind("End of Project Gutenberg's Crime")
raw=raw[5303:1157681]
raw.find("PART I")
处理html 未作测试??
url='http://news.bbc.co.uk/2/hi/health/2284783.stm'
html=urlopen(url).read().decode('utf8')
html[:60]
#对html进行分词
raw=nltk.clean_html(html)
tokens=nltk.word_tokenize(raw)
tokens
#取得感兴趣的标识符
tokens=tokens[96:399]
text=nltk.Text(tokens)
text.concordance('gene')
处理搜索引擎的结果
略
处理rss订阅 未作测试
import feedparser
llog=feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)
post=llog.entries[2]
post.title
content=post.content[0].value
content[0:70]
nltk.word_tokenize(nltk.clean_html(content))
nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))
读取本地文件
#提示找不到文件
>>> f=open('document.txt')
Traceback (most recent call last):
File "<input>", line 1, in <module>
FileNotFoundError: [Errno 2] No such file or directory: 'document.txt'
#查看当前目录,在当前目录下添加document.txt文件
>>> import os
>>> os.listdir('.')
['.idea', 'One', 'Two']
#重新打开并读取文件内容
>>> f=open('document.txt')
>>> f.read()
'this is my time\nTime files like an arrow.\nFruit files like a banana.\n'
#一次读取文件中的一行
>>> f=open('document.txt','rU')
>>> for line in f:
...     print(line.strip())  #删除行尾换行符
...
this is my time
Time files like an arrow.
Fruit files like a banana.
#打开语料库中的文件名
>>> path=nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
>>> raw=open(path,'rU').read()
从PDF,MS WORD 和其他二进制文件中提取文本
#捕获用户在程序交互时的输入
>>> s=input('Enter some text')
Enter some text>? On an exceptionally hot evening early in july
>>> print('You typed',len(nltk.word_tokenize(s)),'words')
You typed 8 words
NLP的流程
>>> raw=open('document.txt').read()
>>> type(raw)
<class 'str'>
#分词
>>> tokens=nltk.word_tokenize(raw)
>>> type(tokens)
<class 'list'>
>>> words=[w.lower() for w in tokens]
>>> type(words)
<class 'list'>
>>> vocab=sorted(set(words))
>>> type(vocab)
<class 'list'>
#可以追加一个元素到一个链表,但不能追加到一个字符串
>>> vocab.append('blog')
>>> raw.append('blog')
Traceback (most recent call last):
File "<input>", line 1, in <module>
AttributeError: 'str' object has no attribute 'append'
#字符串+字符串 链表+链表 链表不能加字符串
>>> query='Who knows?'
>>> beatles=['john','paul','george','ringo']
>>> query+beatles
Traceback (most recent call last):
File "<input>", line 1, in <module>
TypeError: Can't convert 'list' object to str implicitly
2 字符串:最底层的文本处理
#字符串的基本操作
monty = 'Monty python'
print(monty)
circus = "Monty python's Flying Circus"
print(circus)
circus = 'Monty python\'s Flying Circus'
print(circus)
#多行文本可以使用反斜杠连接或是括号
>>> couplet="Shall I compare thee to a Summer's day?"\
... "Thou are more lovely and more temperate:"
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
>>> couplet=("Shall I compare thee to a Summer's day?"
... "Thou are more lovely and more temperate:")
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:
#以上方法显示的字符串没有换行,可以使用三引号
>>> couplet='''Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:'''
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:
>>> couplet="""Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:"""
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:
#字符串的连接操作
>>> 'very'+'very'+'very'
'veryveryvery'
>>> 'very'*3
'veryveryvery'
#输出字符串,使用print()
#访问单个字符串
>>> monty='Monty python'
>>> monty[0]
'M'
#正数为正向索引,负数为反向索引
>>> monty[-1]
'n'
#不要在行尾输出换行符 ??
#字符转小写,过滤非字母字符
>>> import nltk
>>> from nltk.corpus import gutenberg
>>> raw=gutenberg.raw('melville-moby_dick.txt')
>>> fdist=nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
>>> fdist.keys()
dict_keys(['s', 'z', 'r', 'h', 'a', 'i','n', 'b', 't', 'j', 'o', 'e', 'c', 'm', 'x', 'y', 'g', 'd', 'q', 'v', 'w', 'f','k', 'p', 'u', 'l'])
#访问子字串,类似于对链表的切片操作
>>> monty='Monty python'
>>> monty[6:10]
'pyth'
#使用负数索引
>>> monty[-12:-7]
'Monty'
#分别从字串头和尾开始
>>> monty[:5]
'Monty'
>>> monty[6:]
'python'
#测试字串被包含
>>> phrase='And now for something completely different'
>>> if 'thing' in phrase:
...     print("found 'thing")
...
found 'thing
#使用find()查找子字串位置
>>> monty.find('python')
6
#查看字符串的更多操作
>>> help(str)
Help on class str in module builtins:
class str(object)
| str(object='') -> str
| str(bytes_or_buffer[,encoding[, errors]]) -> str
|
| Create a new string object from the given object. If encoding or
| errors is specified, then the object must expose a data buffer
| that will be decoded using the given encoding and error handler.
| Otherwise, returns the result of object.__str__() (if defined)
| or repr(object).
| encoding defaults to sys.getdefaultencoding().
| errors defaults to 'strict'.
|
| Methods defined here:
|
| __add__(self, value, /) #特殊方法(双下划线方法),实现 + 运算符
| Return self+value.
|
| __contains__(self, key, /)
| Return key in self.
|
| __eq__(self, value, /)
| Return self==value.
|
| __format__(...)
| S.__format__(format_spec) -> str
|
| Return a formatted version of S as described by format_spec.
|
| __ge__(self, value, /)
| Return self>=value.
|
| __getattribute__(self, name, /)
| Return getattr(self, name).
|
| __getitem__(self, key, /)
| Return self[key].
|
| __getnewargs__(...)
|
| __gt__(self, value, /)
| Return self>value.
|
| __hash__(self, /)
| Return hash(self).
|
| __iter__(self, /)
| Implement iter(self).
|
| __le__(self, value, /)
| Return self<=value.
|
| __len__(self, /)
| Return len(self).
|
| __lt__(self, value, /)
| Return self<value.
|
| __mod__(self, value, /)
| Return self%value.
|
| __mul__(self, value, /)
| Return self*value.
|
| __ne__(self, value, /)
| Return self!=value.
|
| __new__(*args, **kwargs) from builtins.type
| Create and return a new object. See help(type) for accurate signature.
|
| __repr__(self, /)
| Return repr(self).
|
| __rmod__(self, value, /)
| Return value%self.
|
| __rmul__(self, value, /)
| Return self*value.
|
| __sizeof__(...)
| S.__sizeof__() -> size of S in memory, in bytes
|
| __str__(self, /)
| Return str(self).
|
| capitalize(...)
| S.capitalize() -> str
|
| Return a capitalized version of S, i.e. make the first character
| have upper case and the rest lower case.
|
| casefold(...)
| S.casefold() -> str
|
| Return a version of S suitable for caseless comparisons.
|
| center(...)
| S.center(width[, fillchar]) -> str
|
| Return S centered in a string of length width. Padding is
| done using the specified fill character (default is a space)
|
| count(...) #统计子字串在字符串中出现的次数(不重叠)
| S.count(sub[, start[, end]]) -> int
|
| Return the number of non-overlapping occurrences of substring sub in
| string S[start:end]. Optional arguments start and end are
| interpreted as in slice notation.
|
| encode(...)
| S.encode(encoding='utf-8', errors='strict') -> bytes
|
| Encode S using the codec registered for encoding. Default encoding
| is 'utf-8'. errors may be given to set a different error
| handling scheme. Default is 'strict' meaning that encoding errors raise
| a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
| 'xmlcharrefreplace' as well as any other name registered with
| codecs.register_error that can handle UnicodeEncodeErrors.
|
| endswith(...) #是否以指定字符串结尾
| S.endswith(suffix[, start[, end]]) -> bool
|
| Return True if S ends with the specified suffix, False otherwise.
| With optional start, test S beginning at that position.
| With optional end, stop comparing S at that position.
| suffix can also be a tuple of strings to try.
|
| expandtabs(...)
| S.expandtabs(tabsize=8) -> str
|
| Return a copy of S where all tab characters are expanded using spaces.
| If tabsize is not given, a tab size of 8 characters is assumed.
|
| find(...) #查找子字串的第一个索引
| S.find(sub[, start[, end]]) -> int
|
| Return the lowest index in S where substring sub is found,
| such that sub is contained within S[start:end]. Optional
| arguments start and end are interpreted as in slice notation.
|
| Return -1 on failure.
|
| format(...) #格式化字串
| S.format(*args, **kwargs) -> str
|
| Return a formatted version of S, using substitutions from args and kwargs.
| The substitutions are identified by braces ('{' and '}').
|
| format_map(...)
| S.format_map(mapping) -> str
|
| Return a formatted version of S, using substitutions from mapping.
| The substitutions ar