XPATH语法:
// 从文档任意层级查找匹配的节点（不限于根节点的直接子节点），例如 //a 匹配文档中所有 a 标签
/ 位于开头时表示从根节点开始；位于路径中间时表示向下一层（直接子节点）查找，例如 /html/body
/text() 提取文本内容
/@xxx 提取属性内容
Sample:
import requests
from lxml import etree
# Sample: walk listing pages 1-20 and print the title text of each article.

# Absolute path to every article's title link text. The [*] predicate keeps
# only <article> elements that have at least one child element.
TITLE_XPATH = '/html/body/section/div[1]/div/article[*]/header/h2/a/text()'

for page in range(1, 21):
    url = "http://www.xxx.com/topic/tv/page/{}".format(page)
    # A timeout keeps one unresponsive page from hanging the whole run;
    # requests waits indefinitely when none is given.
    req = requests.get(url, timeout=10).content
    html = etree.HTML(req)
    # Extract the text
    text = html.xpath(TITLE_XPATH)
    for each in text:
        # NOTE(review): the leading 1 is a literal, not the page number --
        # confirm whether `page` was intended here.
        print(1, each)
    # Extract the links (the /@attr form returns attribute values)
    # link = html.xpath('//a/@href')
    # for href in link:
    #     print(href)