python读取html_如何使用Python从HTML文件中提取文本?

可以尝试如下代码:

"""

HTML <-> text conversions.

"""

import re

try:  # Python 2 module names
    from HTMLParser import HTMLParser, HTMLParseError
    from htmlentitydefs import name2codepoint
except ImportError:  # Python 3 module names
    from html.entities import name2codepoint
    from html.parser import HTMLParser

    try:
        from html.parser import HTMLParseError
    except ImportError:
        # Removed in Python 3.5; keep the name so ``except HTMLParseError``
        # clauses still resolve.
        class HTMLParseError(Exception):
            pass

try:
    _unichr = unichr  # Python 2: chr() only yields byte strings there
except NameError:
    _unichr = chr  # Python 3: chr() already returns text


class _HTMLToText(HTMLParser):
    """HTML parser that collects the readable text of a document.

    Text nodes are appended to an internal buffer with runs of whitespace
    collapsed to a single space.  ``<p>`` (opening and closing) and ``<br>``
    tags contribute a newline.  Everything between ``<script>``/``<style>``
    and the matching end tag is suppressed.  Call :meth:`get_text` after
    feeding the markup to obtain the result.
    """

    def __init__(self):
        # Explicit base-class call (not super()) so this also works on
        # Python 2.
        HTMLParser.__init__(self)
        self._buf = []  # text fragments, joined by get_text()
        self.hide_output = False  # True while inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        """Emit a newline for ``<p>``/``<br>``; start hiding on script/style."""
        if tag in ('p', 'br') and not self.hide_output:
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = True

    def handle_startendtag(self, tag, attrs):
        """Emit a newline for a self-closing ``<br/>``."""
        # Same visibility rule as handle_starttag.
        if tag == 'br' and not self.hide_output:
            self._buf.append('\n')

    def handle_endtag(self, tag):
        """Emit a newline after ``</p>``; stop hiding after script/style."""
        if tag == 'p':
            self._buf.append('\n')
        elif tag in ('script', 'style'):
            self.hide_output = False

    def handle_data(self, text):
        """Store a text node with whitespace runs collapsed to one space."""
        if text and not self.hide_output:
            self._buf.append(re.sub(r'\s+', ' ', text))

    def handle_entityref(self, name):
        """Store the character for a named entity such as ``&amp;``.

        Names missing from ``name2codepoint`` are silently dropped.
        """
        if name in name2codepoint and not self.hide_output:
            self._buf.append(_unichr(name2codepoint[name]))

    def handle_charref(self, name):
        """Store the character for a numeric reference (``&#65;``, ``&#x41;``).

        ``name`` is the reference without the leading ``&#``.  Both ``x`` and
        ``X`` are accepted as the hexadecimal prefix.  A reference that is
        not a valid number or not a valid code point is dropped, matching
        how unknown named entities are handled, instead of aborting the
        whole conversion.
        """
        if self.hide_output:
            return
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
            self._buf.append(_unichr(code))
        except (ValueError, OverflowError):
            # Malformed digits or a code point outside the Unicode range.
            pass

    def get_text(self):
        """Return the collected text with runs of spaces squeezed to one."""
        return re.sub(r' +', ' ', ''.join(self._buf))

def html_to_text(html):
    """Return the plain text contained in a piece of HTML.

    Entities and character references are decoded; the contents of
    ``<script>`` and ``<style>`` elements are left out.  If the parser
    reports an ``HTMLParseError``, whatever text was collected up to that
    point is returned.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        extractor.close()
    except HTMLParseError:
        # Best effort: keep what was extracted before the parser gave up.
        pass
    return extractor.get_text()

def text_to_html(text):
    """Convert plain text to HTML-safe markup.

    Anything that looks like an ``http://`` or ``https://`` URL is wrapped
    in an ``<a href="...">`` tag, and the characters ``& ' " < >`` are
    replaced by HTML entities.  Newlines are left untouched.

    A URL ends at whitespace, a quote, a bracket/parenthesis, an angle
    bracket or a semicolon, so trailing markup or the next line is never
    swallowed into the link.
    """
    entities = {
        '&': '&amp;',
        "'": '&#39;',
        '"': '&quot;',
        '<': '&lt;',
        '>': '&gt;',
    }

    def f(mo):
        t = mo.group()
        if len(t) == 1:
            # A URL match is always longer than one character, so this is
            # one of the special characters.
            return entities[t]
        # The pattern keeps quotes and angle brackets out of URLs, which
        # leaves '&' as the only character needing escaping here.
        url = t.replace('&', '&amp;')
        return '<a href="%s">%s</a>' % (url, url)

    return re.sub(r'https?://[^]\s()"\'<>;]+|[&\'"<>]', f, text)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值