我编写了一个递归函数,它以字典的形式返回标签中所有文本的 XPath,格式如下:
{'xpath1': {'text': 'text1'}, 'xpath2': {'text': 'text2'}, ...}
代码:
from bs4 import BeautifulSoup, NavigableString
def get_xpaths_dict(soup, xpaths=None, curr_path=''):
    """Collect the non-blank text found under *soup*, keyed by an XPath-like path.

    The path of a node is built by appending ``'/<name>'`` for every nested
    node, starting from ``curr_path``.  The first non-blank string seen at a
    given path is stored under that path together with a running ``'count'``;
    every later string at the same path is stored under ``'<path>[<n>]'``
    where ``n`` is the updated count.

    Args:
        soup: Node to walk.  It must expose ``name`` and ``contents``
            (presumably a BeautifulSoup ``Tag`` -- confirm against callers).
        xpaths: Dictionary to accumulate results into.  A new dictionary is
            created when omitted, so separate calls do not share state.
        curr_path: Path prefix of the parent node.

    Returns:
        The ``xpaths`` dictionary, of the form
        ``{'path': {'text': ..., 'count': ...}, 'path[2]': {'text': ...}}``.
    """
    # A ``{}`` default would be created once and reused by every call,
    # leaking entries from one document into the next.
    if xpaths is None:
        xpaths = {}

    # Path of this node; it must stay unchanged for the whole loop so that
    # sibling strings and child nodes are all anchored at the same place.
    curr_path += '/{}'.format(soup.name)

    for item in soup.contents:
        if not isinstance(item, NavigableString):
            xpaths = get_xpaths_dict(item, xpaths, curr_path)
            continue

        text = item.strip()
        if not text:
            # Whitespace-only strings carry no text worth recording.
            continue

        entry = xpaths.get(curr_path)
        if entry is None or 'count' not in entry:
            # First string recorded for this path.
            xpaths[curr_path] = {'text': text, 'count': 1}
        else:
            # Repeated string: bump the counter kept on the base entry and
            # store the text under an indexed key built in a separate
            # variable, instead of appending the index to ``curr_path``.
            entry['count'] += 1
            indexed_path = '{}[{}]'.format(curr_path, entry['count'])
            xpaths[indexed_path] = {'text': text}

    return xpaths
html &