废话不多说,直接开干!
说明
0. text:获取标签包裹的文本数据,
也就是用于存取 HTML 元素的文本内容
eg:element.text
1. attrib:获取标签上的属性(类似字典,按属性名取值)
eg:element.attrib['title'],element.attrib['href']
直接看代码
import asyncio
from pyppeteer import launch
from lxml import etree
async def main():
    """Load the Douban movie chart in a browser and print scraped data.

    Steps performed:
      1. Start a browser with pyppeteer's ``launch()`` and open a new page.
      2. Navigate to ``https://movie.douban.com/chart`` and wait until the
         XPath ``//table//a[@title]`` matches.
      3. Parse the page content with ``lxml.etree.HTML``.
      4. Print the text of every ``<p>`` inside a table that carries a
         ``class`` attribute, then print the list of ``title`` attribute
         values of every ``<a title=...>`` inside a table.

    The browser is closed in a ``finally`` clause so that it is shut down
    even if navigation, waiting, or parsing raises.
    """
    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto('https://movie.douban.com/chart')
        # Do not read the page until the titled links exist in the DOM.
        await page.waitForXPath('//table//a[@title]')
        doc = etree.HTML(await page.content())
        # Debugging aid: dump the parsed tree with
        #   print(etree.tostring(doc, encoding="utf-8",
        #                        pretty_print=True).decode("utf-8"))
        for element in doc.xpath('//table//p[@class]'):
            # The wrapped text is exposed as ``.text``; ``attrib`` only holds
            # the tag's attributes, so ``attrib['text']`` raised KeyError.
            print(element.text)
        names = [
            element.attrib['title']
            for element in doc.xpath('//table//a[@title]')
        ]
        print('Names: ', names)
    finally:
        await browser.close()
# Obtain the current event loop and block until the scraper coroutine finishes.
event_loop = asyncio.get_event_loop()
event_loop.run_until_complete(main())