Harser 是一个简单的 Python HTML 解析器。
安装:pip install harser
示例代码:
>>> from harser import Harser
>>> HTML = '''
Lorem Ipsum
Dolor sit amet
first block
second block
third block
fourth layer
foo ter
'''
>>> harser = Harser(HTML)
>>> harser.find('div', class_='header').children(class_='nav-item').find('text').extract()
# Or, more simply:
# harser.find(class_='nav-item').find('text').extract()
['First item', 'Second item', 'Third item']
>>> harser.find(class_='nav-item').get_attr('href').extract()
['/nav1', '/nav2', '/nav3']
# The following two calls are equivalent:
>>> harser.find('div', class_='header', id='id-header')
>>> harser.find('div', attrs={'class': 'header', 'id': 'id-header'})
>>> harser.find(id__contains='bar').get_attr('class').extract()
['footer']
>>> harser.find(href__not_contains='2').find('text').extract()
['First item', 'Third item']
>>> harser.find(attrs={'data-nav__contains': 'second'}).next_siblings().find('text').extract()
['Third item']
>>> harser.find('li').parent().next_siblings(filters={'text__contains': 'Second'}).clean_extract()
[...]  # NOTE: the expected output was truncated in this copy of the document
>>> harser.find('h3', filters={'span.@id__starts_with': 'foo'}).get_attr('some-attr').extract()
['hey']
>>> harser.find('div').children('h3').xpath
'//descendant::div/h3'