from lxml import etree
def range_file(page):
    """Extract image URLs from a saved HTML page and append them to a file.

    Parses the local file named ``'尤果' + str(page) + '.html'`` with lxml's
    HTML parser, collects the ``src`` attribute of every ``<img>`` element,
    and for each value longer than 30 characters prints it and appends it
    (one per line) to ``信息.txt``.

    Args:
        page: Page identifier; its ``str()`` form is used to build the
            input file name.

    Returns:
        None. Results are printed and appended to the output file.

    Any exception raised while parsing or writing propagates to the caller.
    """
    parser = etree.HTMLParser(encoding="utf-8")
    source_path = '尤果' + str(page) + '.html'
    document = etree.parse(source_path, parser=parser)
    image_sources = document.xpath('//img/@src')

    # The context manager guarantees the output file is closed even if
    # printing or writing raises part-way through the loop.
    with open('信息.txt', 'a', encoding='utf-8') as output:
        for src in image_sources:
            # Length filter: only src values longer than 30 characters
            # are kept; shorter ones are skipped.
            if len(src) > 30:
                print(src)
                output.write(src + '\n')
# Process the ten saved pages numbered 0 through 9.
for page_number in range(10):
    range_file(page_number)
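# Note: to extract the src attribute of every <img> element, use the XPath
# expression //img/@src directly. The src values include some junk entries,
# so they are filtered out by length (only values longer than 30 characters
# are kept).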