import requests
from lxml import etree
import time
"""
场景:爬取煎蛋网20个页面的所有标题
"""
def jandan_index_title(u_num,t_num,u_rl="http://jandan.net/page/",xpath="//*[@id='content']/div[%s]/div/h2/a"):
"""
:param u_num: 传入url参数
:param t_num: 传入title参数
:param u_rl: 因为这里要下载很多个页面,我们观察url规律,做了url的参数化
:param xpath: 因为一个页面上的不同的标题位置对应的xpath不一致,观察了规律,做了xpath的参数化
:return: title_list
"""
title_list = []
for i in range(1,u_num):
url = u_rl+"%s"%i
print(url)
res = requests.get(url).text
HTML = etree.HTML(res)
for i in range(1,t_num):
x_p = xpath%i
title = HTML.xpath(x_p)
for t in title:
t_text = t.text
with open("res.txt","a+"