# 完整代码 (complete code)
from urllib.parse import urljoin

import requests
from lxml import html

from module.write_and_read import dump_excel
# Accumulates one [title, link, date] row per scraped headline across all
# calls to main(); the script entry point hands this list to dump_excel.
head_list = []


def _joined_text(nodes):
    """Join the fragments of an XPath ``text()`` result into one stripped string."""
    return "".join(nodes).strip()


def main(url, page, num):
    """Scrape one news listing page and append its headlines to ``head_list``.

    Each ``div.txt`` item under ``ul.p-list10`` contributes exactly one
    three-element row ``[title, link, date]``, so rows always line up with a
    three-column header regardless of how many text nodes an element holds.

    Args:
        url: Address of the listing page to download.
        page: Unused; kept so existing callers keep working.
        num: Running count of rows collected before this call.

    Returns:
        ``num`` increased by the number of rows appended during this call.
        A page that answers with an HTTP error status adds no rows.

    Raises:
        requests.exceptions.RequestException: If the request fails at the
            network level or exceeds the timeout.
    """
    headers = {
        # Placeholder text ("your own computer's user agent"); replace it
        # with a real browser User-Agent string before running.
        'User-Agent': '自己电脑的用户代理',
    }
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, headers=headers, timeout=10)
    if not resp.ok:
        # An error page carries no headline list; skip it instead of parsing it.
        print(f"skipping {url}: HTTP {resp.status_code}")
        return num

    text = html.etree.HTML(resp.text)
    headline_list = text.xpath('//ul[@class="p-list10"]/li/div[@class="txt"]')
    print(headline_list)
    print(len(headline_list))

    for headline in headline_list:
        title = _joined_text(headline.xpath('./h4[@class="tit"]/a[@href]/text()'))
        hrefs = headline.xpath('./h4[@class="tit"]/a[@target="_blank"]/@href')
        # urljoin resolves site-relative hrefs against the host and leaves
        # absolute hrefs intact; an item without a link gets an empty cell
        # instead of raising IndexError.
        link = urljoin("http://journal.whu.edu.cn", hrefs[0]) if hrefs else ""
        date = _joined_text(headline.xpath('./div[@class="date"]/text()'))
        # Build the row locally and append it whole, so correctness no longer
        # depends on ``num`` matching the current length of ``head_list``.
        head_list.append([title, link, date])
        num += 1

    print(head_list)
    return num
if __name__ == '__main__':
    # Script entry point: walk listing pages 1 through 9, letting main()
    # collect rows into head_list, then hand everything to dump_excel.
    row_count = 0
    page = 0  # forwarded to main() unchanged on every call
    for page_no in range(1, 10):
        listing_url = f"http://journal.whu.edu.cn/news/index/page/{page_no}"
        row_count = main(listing_url, page, row_count)

    # Column headers: title, URL, time.
    column_names = ['标题', '网址', '时间']
    # NOTE(review): dump_excel comes from module.write_and_read and is not
    # visible here; presumably it writes the rows to a workbook named
    # "test1" - confirm against that module.
    dump_excel(column_names, head_list, "test1")