python3 解析html_python3如何解析html

3f339eadc091302d2994fab99169cc87.png

解析html是爬虫后的重要的一个处理数据的环节。以下记录解析html的几种方式。

先介绍基础的辅助函数,主要用于获取html并输出解析后的结果。解析函数作为参数传入,便于下面替换不同的解析方式。

def get_html(url, paraser=None):
    """Download *url* and return whatever *paraser* extracts from its body.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the raw page body and returns the
            parsed result. When omitted, ``bs4_paraser`` is used; it is
            looked up at call time so this function can be defined before
            the parser functions.

    Returns:
        The value returned by ``paraser``, or ``None`` when the response
        status code is not 200.
    """
    if paraser is None:
        paraser = bs4_paraser
    # No explicit 'Host' header: urllib2 derives it from the URL, which keeps
    # the function usable for pages on other hosts.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/52.0.2743.116 Safari/537.36'),
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        data = response.read()
        # Only gunzip when the server actually compressed the body; an
        # uncompressed reply would otherwise make GzipFile raise.
        # NOTE(review): 'deflate'/'sdch' are advertised above but are not
        # decoded here - confirm whether they should be dropped from the
        # Accept-Encoding header.
        if response.info().get('Content-Encoding') == 'gzip':
            data = gzip.GzipFile(fileobj=StringIO.StringIO(data)).read()
    finally:
        response.close()
    # For offline debugging, the contents of a saved copy of the page can be
    # passed to the parser instead of the downloaded body.
    return paraser(data)

# Demo: download one movie page and print every review record extracted by
# the lxml-based parser.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)

# get_html yields None for a non-200 response, so fall back to an empty list.
# print(row) with a single argument behaves the same on Python 2 and 3.
for row in value or []:
    print(row)

1、lxml.html的方式进行解析。

# The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and
# libxslt. It is unique in that it combines the speed and XML feature
# completeness of these libraries with the simplicity of a native Python API,
# mostly compatible but superior to the well-known ElementTree API. The latest
# release works with all CPython versions from 2.6 to 3.5. See the
# introduction for more information about background and goals of the lxml
# project. Some common questions are answered in the FAQ.
# Official site: http://lxml.de/
def lxml_parser(page):
    """Extract review records from *page* with lxml XPath queries.

    Args:
        page: HTML source of the page, as accepted by ``etree.HTML``.

    Returns:
        A list with one dict per review item that has a title block. Keys:
        ``title`` (link text), ``title_href`` (link target), ``score`` (first
        integer in the rating span's ``style`` attribute divided by 20),
        ``time`` (text of the ``time`` span) and ``people`` (first integer in
        the text of the ``num`` div's span; the number of likes).
    """
    data = []
    doc = etree.HTML(page)
    for row in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every review is a <div class="item"> inside the list wrapper.
        for r in row.xpath('.//div[@class="item"]'):
            title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
            if not title:
                # No title block: skip instead of raising IndexError, the
                # same guard the BeautifulSoup variant applies.
                continue
            head = title[0]
            score_text = head.xpath('./div/span/span/@style')[0]
            people_text = head.xpath('./div[@class="num"]/span/text()')[0]
            data.append({
                'title': head.xpath('./a/text()')[0],
                'title_href': head.xpath('./a/@href')[0],
                # "/" floors on Python 2 and yields a float on Python 3.
                'score': int(re.search(r'\d+', score_text).group()) / 20,
                'time': head.xpath('./div/span[@class="time"]/text()')[0],
                'people': int(re.search(r'\d+', people_text).group()),
            })
    return data

# 2. Using BeautifulSoup. Not much to add here; plenty of material about it
# is available online.
def bs4_paraser(html):
    """Extract review records from *html* with BeautifulSoup.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per review item that has a title block. Keys:
        ``title``, ``title_href``, ``score`` (first integer in the rating
        span's ``style`` attribute divided by 20), ``time`` and ``people``
        (first integer in the ``num`` div's span text; the number of likes).
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Only the first review list wrapper is examined.
    all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for row in all_div:
        # Every review is a <div class="item"> inside the list wrapper.
        for r in row.find_all('div', attrs={'class': 'item'}):
            title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if not title:
                # Items without a title block produce no record.
                continue
            head = title[0]
            score_text = re.search(r'\d+', head.div.span.span['style']).group()
            people_text = head.find_all('div', attrs={'class': 'num'})[0].span.string
            # A fresh dict per item; nothing is shared between iterations.
            all_value.append({
                'title': head.a.string,
                'title_href': head.a['href'],
                # "/" floors on Python 2 and yields a float on Python 3.
                'score': int(score_text) / 20,
                'time': head.div.find_all('span', attrs={'class': 'time'})[0].string,
                'people': int(re.search(r'\d+', people_text).group()),
            })
    return all_value

# 3. Using SGMLParser: parsing is driven by start/end tag callbacks. The
# process is easy to follow but somewhat tedious, and this use case is not a
# particularly good fit for the approach.
class CommentParaser(SGMLParser):
    """Collect review records from a page through SGMLParser tag callbacks.

    After ``feed()``, ``data`` holds one dict per closed ``<div class="item">``
    located inside ``<div class="yingping-list-wrap">``. A record may contain
    ``href``/``title_href``, ``title``, ``score``, ``time`` and ``people``,
    depending on which elements were seen inside the item's title block.
    """

    # Exact ``class`` attribute values of the <div> elements that open a
    # tracked scope, mapped to the scope name stored on the div stack.
    _DIV_SCOPES = {
        'yingping-list-wrap': 'yingping',
        'item': 'item',
        'g-clear title-wrap': 'gclear',
        'rating-wrap g-clear': 'ratingwrap',
        'num': 'num',
    }

    def __init__(self):
        SGMLParser.__init__(self)
        # One entry per currently open <div>: its scope name, or None for a
        # div that is not tracked. Popping on </div> tells us exactly which
        # div is closing, so unrelated nested divs cannot close a scope.
        self.__div_stack = []
        # True while inside an <a> that was opened within the title block.
        self.__start_a = False
        # 0 = no tracked span, 1 = "rating" span, 2 = "time" span,
        # 3 = inner span carrying the score style, 4 = span in the "num" div.
        self.__span_state = 0
        # Record being assembled for the current item.
        self.__value = {}
        self.data = []

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None."""
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else None

    def __in_title(self):
        """Tell whether the list, item and title-wrap divs are all open."""
        stack = self.__div_stack
        return 'yingping' in stack and 'item' in stack and 'gclear' in stack

    def start_div(self, attrs):
        scope = None
        for k, v in attrs:
            if k == 'class' and v in self._DIV_SCOPES:
                scope = self._DIV_SCOPES[v]
        self.__div_stack.append(scope)

    def end_div(self):
        if not self.__div_stack:
            # Stray </div> without a matching start tag.
            return
        scope = self.__div_stack.pop()
        # An item closing inside the review list completes one record.
        if scope == 'item' and 'yingping' in self.__div_stack:
            self.data.append(self.__value)
            self.__value = {}

    def start_a(self, attrs):
        if self.__in_title():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    # 'href' is the key this parser has always emitted;
                    # 'title_href' mirrors the key used by lxml_parser and
                    # bs4_paraser so the parsers are interchangeable.
                    self.__value['href'] = v
                    self.__value['title_href'] = v

    def end_a(self):
        self.__start_a = False

    def start_span(self, attrs):
        if not self.__in_title():
            return
        if 'ratingwrap' in self.__div_stack:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested directly inside the "rating" span: its style
                # attribute carries the number the score is derived from.
                for k, v in attrs:
                    if k == 'style':
                        score = self._first_int(v)
                        if score is not None:
                            # "/" floors on Python 2, floats on Python 3.
                            self.__value['score'] = score / 20
                        self.__span_state = 3
        elif 'num' in self.__div_stack:
            self.__span_state = 4

    def end_span(self):
        self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Number of likes; text without digits is ignored, not fatal.
            people = self._first_int(data)
            if people is not None:
                self.__value['people'] = people

def sgl_parser(html):
    """Parse *html* with ``CommentParaser`` and return its collected records.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list.
    """
    parser = CommentParaser()
    parser.feed(html)
    # Without close(), data still buffered after the last feed() is never
    # processed.
    parser.close()
    return parser.data

# 4. HTMLParser: the same principle as approach 3; only the callback methods
# differ, so the logic is largely shared.
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Collect review records from a page through HTMLParser callbacks.

    After ``feed()``, ``data`` holds one dict per closed ``<div class="item">``
    located inside ``<div class="yingping-list-wrap">``. A record may contain
    ``href``/``title_href``, ``title``, ``score``, ``time`` and ``people``,
    depending on which elements were seen inside the item's title block.
    """

    # Exact ``class`` attribute values of the <div> elements that open a
    # tracked scope, mapped to the scope name stored on the div stack.
    _DIV_SCOPES = {
        'yingping-list-wrap': 'yingping',
        'item': 'item',
        'g-clear title-wrap': 'gclear',
        'rating-wrap g-clear': 'ratingwrap',
        'num': 'num',
    }

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # One entry per currently open <div>: its scope name, or None for a
        # div that is not tracked. Popping on </div> tells us exactly which
        # div is closing, so unrelated nested divs cannot close a scope.
        self.__div_stack = []
        # True while inside an <a> that was opened within the title block.
        self.__start_a = False
        # 0 = no tracked span, 1 = "rating" span, 2 = "time" span,
        # 3 = inner span carrying the score style, 4 = span in the "num" div.
        self.__span_state = 0
        # Record being assembled for the current item.
        self.__value = {}
        self.data = []

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None."""
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else None

    def __in_title(self):
        """Tell whether the list, item and title-wrap divs are all open."""
        stack = self.__div_stack
        return 'yingping' in stack and 'item' in stack and 'gclear' in stack

    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            scope = None
            for k, v in attrs:
                if k == 'class' and v in self._DIV_SCOPES:
                    scope = self._DIV_SCOPES[v]
            self.__div_stack.append(scope)
        elif tag == 'a':
            if self.__in_title():
                self.__start_a = True
                for k, v in attrs:
                    if k == 'href':
                        # 'href' is the key this parser has always emitted;
                        # 'title_href' mirrors the key used by lxml_parser
                        # and bs4_paraser so the parsers are interchangeable.
                        self.__value['href'] = v
                        self.__value['title_href'] = v
        elif tag == 'span':
            if not self.__in_title():
                return
            if 'ratingwrap' in self.__div_stack:
                if self.__span_state != 1:
                    for k, v in attrs:
                        if k == 'class' and v == 'rating':
                            self.__span_state = 1
                        elif k == 'class' and v == 'time':
                            self.__span_state = 2
                else:
                    # Span nested directly inside the "rating" span: its
                    # style attribute carries the number the score is
                    # derived from.
                    for k, v in attrs:
                        if k == 'style':
                            score = self._first_int(v)
                            if score is not None:
                                # "/" floors on Python 2, floats on Python 3.
                                self.__value['score'] = score / 20
                            self.__span_state = 3
            elif 'num' in self.__div_stack:
                self.__span_state = 4

    def handle_endtag(self, tag):
        if tag == 'div':
            if not self.__div_stack:
                # Stray </div> without a matching start tag.
                return
            scope = self.__div_stack.pop()
            # An item closing inside the review list completes one record.
            if scope == 'item' and 'yingping' in self.__div_stack:
                self.data.append(self.__value)
                self.__value = {}
        elif tag == 'a':
            self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Number of likes; text without digits is ignored, not fatal.
            people = self._first_int(data)
            if people is not None:
                self.__value['people'] = people

def html_parser(html):
    """Parse *html* with ``CommentHTMLParser`` and return its records.

    Args:
        html: HTML source of the page.

    Returns:
        The parser's ``data`` list.
    """
    parser = CommentHTMLParser()
    parser.feed(html)
    # Without close(), data still buffered after the last feed() is never
    # processed.
    parser.close()
    return parser.data

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值