Python Natural Language Processing -- Study Notes 3

Chapter 3: Processing Raw Text

 

1 Accessing Text from the Web and from Disk

# The English translation of Crime and Punishment (not yet tested ??)

from urllib.request import urlopen
url = 'http://www.gutenberg.org/files/2554/2554.txt'
raw = urlopen(url).read().decode('utf-8')  # decode the downloaded bytes into a str
type(raw)
len(raw)
raw[:75]

 

# Tokenization (not yet tested ??)

import nltk
tokens = nltk.word_tokenize(raw)
type(tokens)
len(tokens)
tokens[:10]

 

# Slicing

text = nltk.Text(tokens)
type(text)
text[1020:1060]
text.collocations()

 

# Manually pick out the part of the text we want, trimming the header and footer information

raw.find('PART I')  # get the index of the substring
raw.rfind("End of Project Gutenberg's Crime")
raw = raw[5303:1157681]
raw.find('PART I')

 

Dealing with HTML (not yet tested ??)

url = 'http://news.bbc.co.uk/2/hi/health/2284783.stm'
html = urlopen(url).read()
html[:60]

 

# Strip the HTML markup and tokenize

raw = nltk.clean_html(html)
tokens = nltk.word_tokenize(raw)
tokens

 

 

# Pick out the tokens of interest

tokens = tokens[96:399]
text = nltk.Text(tokens)
text.concordance('gene')
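Note (my addition, not in the original notes): in NLTK 3 the clean_html() helper was removed and now only raises an error pointing you to an HTML parser such as BeautifulSoup. A minimal sketch, assuming the bs4 package is installed and html already holds the downloaded page:

from bs4 import BeautifulSoup

raw = BeautifulSoup(html, 'html.parser').get_text()  # strip the tags, keep the visible text
tokens = nltk.word_tokenize(raw)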

 

Processing Search Engine Results

 

Processing RSS Feeds (not yet tested)

import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']
len(llog.entries)
post = llog.entries[2]
post.title
content = post.content[0].value
content[0:70]
nltk.word_tokenize(nltk.clean_html(content))
nltk.word_tokenize(nltk.clean_html(llog.entries[2].content[0].value))
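A small extra of my own (not in the original notes): each feedparser entry carries a title attribute, so you can list every post in the feed with a simple loop:

for post in llog.entries:
    print(post.title)  # one line per blog post in the feed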

 

Reading Local Files

# An error is reported if the file cannot be found

>>> f=open('document.txt')

Traceback (most recent call last):

 File "<input>", line 1, in <module>

FileNotFoundError: [Errno 2] No such file or directory: 'document.txt'

#Check the current directory, then create a document.txt file there

>>> import os

>>> os.listdir('.')

['.idea', 'One', 'Two']

 

#Reopen the file and read its contents

>>> f=open('document.txt')

>>> f.read()

'this is my time\nTime files like an arrow.\nFruit files like a  banana.\n'

 

 

#Read the file one line at a time

>>> f=open('document.txt','rU')

>>> for line in f:

...    print(line.strip())  # strip the trailing newline

...

this is my time

Time files like an arrow.

Fruit files like a  banana.

 

#Open a file from the NLTK corpus collection by name

>>> path=nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')

>>> raw=open(path,'rU').read()
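A side note from me (not in the original): the 'U' in mode 'rU' asked for universal-newline handling, which Python 3 text mode already does by default; the flag is deprecated (and removed in recent Python versions), so plain 'r' is enough:

raw = open(path, 'r').read()  # 'r' already translates \r\n and \r line endings to \n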

 

 

Extracting Text from PDF, MS Word, and Other Binary Formats
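The notes stop at the heading here, so the following is only my own sketch. It assumes the third-party packages pypdf and python-docx are installed and that files named example.pdf and example.docx exist:

import nltk
from pypdf import PdfReader   # pip install pypdf
from docx import Document     # pip install python-docx

# text of the first page of a PDF
pdf_text = PdfReader('example.pdf').pages[0].extract_text()

# text of every paragraph of a Word document
doc_text = '\n'.join(p.text for p in Document('example.docx').paragraphs)

tokens = nltk.word_tokenize(pdf_text + '\n' + doc_text)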

 

#Capturing user input while the program is running

>>> s=input('Enter some text')

Enter some text>? On an exceptionally hot evening early in july

>>> print('You typed',len(nltk.word_tokenize(s)),'words')

You typed 8 words

 

The NLP Pipeline

>>> raw=open('document.txt').read()

>>> type(raw)

<class 'str'>

 

#Tokenization

>>> tokens=nltk.word_tokenize(raw)

>>> type(tokens)

<class 'list'>

>>> words=[w.lower() for w in tokens]

>>> type(words)

<class 'list'>

>>> vocab=sorted(set(words))

>>> type(vocab)

<class 'list'>
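To keep the whole pipeline in one place, here is a small helper of my own (not from the book) that goes from a raw string to a sorted vocabulary:

import nltk

def build_vocab(raw):
    """Raw string -> tokens -> lowercased words -> sorted vocabulary."""
    tokens = nltk.word_tokenize(raw)      # str -> list of token strings
    words = [w.lower() for w in tokens]   # normalize case
    return sorted(set(words))             # unique words in alphabetical order

vocab = build_vocab(open('document.txt').read())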

 

 

#You can append an element to a list, but not to a string

>>> vocab.append('blog')

>>> raw.append('blog')

Traceback (most recent call last):

 File "<input>", line 1, in <module>

AttributeError: 'str' object has no attribute 'append'

 

#string + string and list + list are allowed, but a list cannot be added to a string

>>> query='Who knows?'

>>> beatles=['john','paul','george','ringo']

>>> query+beatles

Traceback (most recent call last):

 File "<input>", line 1, in <module>

TypeError: Can't convert 'list' object to str implicitly
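If you really want to combine them, convert explicitly first; a quick sketch of my own:

>>> query + ' ' + ' '.join(beatles)
'Who knows? john paul george ringo'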

 

 

2 Strings: Text Processing at the Lowest Level

#Basic operations with strings


monty = 'Monty python'
print(monty)

circus = "Monty python's Flying Circus"
print(circus)

circus = 'Monty python\'s Flying Circus'
print(circus)

 

 

#A long string can be split across source lines with a backslash or with parentheses

>>> couplet="Shall I compare thee to a Summer's day?"\
... "Thou are more lovely and more temperate:"
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:

 

>>> couplet=("Shall I compare thee to a Summer's day?"
... "Thou are more lovely and more temperate:")
>>> print(couplet)
Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:

 

 

#The versions above print without a line break; use triple quotes to preserve the newline

>>> couplet='''Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:'''
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:

 

>>> couplet="""Shall I compare thee to a Summer's day?
... Thou are more lovely and more temperate:"""
>>> print(couplet)
Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:

 

 

#String concatenation

>>> 'very'+'very'+'very'

'veryveryvery'

>>> 'very'*3

'veryveryvery'

 

#Use print() to display a string
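For example, with the monty string defined above:

>>> print(monty)
Monty python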

 

#Accessing individual characters

>>> monty='Monty python'

>>> monty[0]

'M'

 

#Positive indexes count from the start, negative indexes from the end

>>> monty[-1]

'n'

 

#Printing without a newline at the end of the line ??
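The note above is left open in the original; in Python 3 this is done with the end parameter of print() (my addition):

print(monty, end=' ')  # prints 'Monty python' followed by a space instead of a newline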

 

#Lowercase each character and filter out non-alphabetic characters

>>> import nltk

>>> from nltk.corpus importgutenberg

>>> raw=gutenberg.raw('melville-moby_dick.txt')

>>> fdist=nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

>>> fdist.keys()

dict_keys(['s', 'z', 'r', 'h', 'a', 'i','n', 'b', 't', 'j', 'o', 'e', 'c', 'm', 'x', 'y', 'g', 'd', 'q', 'v', 'w', 'f','k', 'p', 'u', 'l'])
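The keys come back in no particular order; to rank the letters by frequency you can use the standard FreqDist method (my addition, output omitted since the exact counts depend on the text):

>>> fdist.most_common(5)  # the five most frequent letters with their counts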

 

#Accessing substrings, much like slicing a list

>>> monty='Monty python'

>>> monty[6:10]

'pyth'

#Using negative indexes

>>> monty[-12:-7]

'Monty'

 

#Slicing from the start of the string and to its end

>>> monty[:5]

'Monty'

>>> monty[6:]

'python'

 

#Testing whether a string contains a substring

>>> phrase='And now for something completely different'

>>> if 'thing' in phrase:

...    print("found 'thing'")

...

found 'thing'

 

#Use find() to locate the position of a substring

>>> monty.find('python')

6

 

#See more string operations with help()

>>> help(str)

Help on class str in module builtins:

 

class str(object)

 | str(object='') -> str

 |  str(bytes_or_buffer[, encoding[, errors]]) -> str

 | 

 | Create a new string object from the given object. If encoding or

 | errors is specified, then the object must expose a data buffer

 |  that will be decoded using the given encoding and error handler.

 | Otherwise, returns the result of object.__str__() (if defined)

 |  or repr(object).

 | encoding defaults to sys.getdefaultencoding().

 | errors defaults to 'strict'.

 | 

 | Methods defined here:

 | 

 | __add__(self, value, /)  # special (double-underscore) method, not meant to be called directly

 |     Return self+value.

 | 

 | __contains__(self, key, /)

 |     Return key in self.

 | 

 | __eq__(self, value, /)

 |     Return self==value.

 | 

 | __format__(...)

 |     S.__format__(format_spec) -> str

 |     

 |     Return a formatted version of S as described by format_spec.

 | 

 | __ge__(self, value, /)

 |     Return self>=value.

 | 

 | __getattribute__(self, name, /)

 |     Return getattr(self, name).

 | 

 | __getitem__(self, key, /)

 |     Return self[key].

 | 

 | __getnewargs__(...)

 | 

 | __gt__(self, value, /)

 |     Return self>value.

 | 

 | __hash__(self, /)

 |     Return hash(self).

 | 

 | __iter__(self, /)

 |     Implement iter(self).

 | 

 | __le__(self, value, /)

 |     Return self<=value.

 | 

 | __len__(self, /)

 |     Return len(self).

 | 

 | __lt__(self, value, /)

 |     Return self<value.

 | 

 | __mod__(self, value, /)

 |     Return self%value.

 | 

 | __mul__(self, value, /)

 |     Return self*value.

 | 

 | __ne__(self, value, /)

 |     Return self!=value.

 | 

 | __new__(*args, **kwargs) from builtins.type

 |     Create and return a new object. See help(type) for accurate signature.

 | 

 | __repr__(self, /)

 |     Return repr(self).

 | 

 | __rmod__(self, value, /)

 |     Return value%self.

 | 

 | __rmul__(self, value, /)

 |     Return self*value.

 | 

 | __sizeof__(...)

 |     S.__sizeof__() -> size of S in memory, in bytes

 | 

 | __str__(self, /)

 |     Return str(self).

 | 

 | capitalize(...)

 |     S.capitalize() -> str

 |     

 |     Return a capitalized version of S, i.e. make the first character

 |     have upper case and the rest lower case.

 | 

 | casefold(...)

 |     S.casefold() -> str

 |     

 |     Return a version of S suitable for caseless comparisons.

 | 

 | center(...)

 |     S.center(width[, fillchar]) -> str

 |     

 |     Return S centered in a string of length width. Padding is

 |     done using the specified fill character (default is a space)

 | 

 | count(...)  # count occurrences of a substring in the string

 |     S.count(sub[, start[, end]]) -> int

 |     

 |     Return the number of non-overlapping occurrences of substring sub in

 |     string S[start:end].  Optional arguments start and end are

 |     interpreted as in slice notation.

 | 

 | encode(...)

 |     S.encode(encoding='utf-8', errors='strict') -> bytes

 |     

 |     Encode S using the codec registered for encoding. Default encoding

 |     is 'utf-8'. errors may be given to set a different error

 |     handling scheme. Default is 'strict' meaning that encoding errors raise

 |     a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and

 |     'xmlcharrefreplace' as well as any other name registered with

 |     codecs.register_error that can handle UnicodeEncodeErrors.

 | 

 | endswith(...)  # whether the string ends with the given suffix

 |     S.endswith(suffix[, start[, end]]) -> bool

 |     

 |     Return True if S ends with the specified suffix, False otherwise.

 |     With optional start, test S beginning at that position.

 |     With optional end, stop comparing S at that position.

 |     suffix can also be a tuple of strings to try.

 | 

 | expandtabs(...)

 |     S.expandtabs(tabsize=8) -> str

 |     

 |     Return a copy of S where all tab characters are expanded using spaces.

 |     If tabsize is not given, a tab size of 8 characters is assumed.

 | 

 | find(...)  # index of the first occurrence of a substring

 |     S.find(sub[, start[, end]]) -> int

 |     

 |     Return the lowest index in S where substring sub is found,

 |     such that sub is contained within S[start:end].  Optional

 |     arguments start and end are interpreted as in slice notation.

 |     

 |     Return -1 on failure.

 | 

 | format(...)  # format the string

 |     S.format(*args, **kwargs) -> str

 |     

 |     Return a formatted version of S, using substitutions from args and kwargs.

 |     The substitutions are identified by braces ('{' and '}').

 | 

 | format_map(...)

 |     S.format_map(mapping) -> str

 |     

 |     Return a formatted version of S, using substitutions from mapping.

 |     The substitutions are identified by braces ('{' and '}').
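To make the listing concrete, here is a small demonstration of my own using a few of the methods shown above (count, endswith, find and format):

>>> s = 'Monty python'
>>> s.count('o')           # non-overlapping occurrences of a substring
2
>>> s.endswith('python')   # does the string end with this suffix?
True
>>> s.find('py')           # index of the first occurrence, -1 if absent
6
>>> '{0} has {1} characters'.format(s, len(s))
'Monty python has 12 characters'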
