先上程序(爬取百度热榜)
'百度热榜爬虫'
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # Script entry point: download the tophub.today front page, pull the
    # entries of the panel with id "node-2" (saved here as the Baidu hot
    # list), print them, and write them to a UTF-8 text file.
    header = {
        # Browser-like User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4442.4 Safari/537.36'
    }
    url = 'https://tophub.today/'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page_text = requests.get(url=url, headers=header, timeout=10).text
    # Keep the parsed tree under its own name; rebinding `etree` would shadow
    # the lxml module and break any later `etree.HTML(...)` call.
    html_tree = etree.HTML(page_text)
    # Each <a> element under this path is one entry of the panel.
    list_page = html_tree.xpath('//*[@id="node-2"]/div/div[2]/div[1]/a')
    # The context manager guarantees the file is flushed and closed, even if
    # an entry fails to parse part-way through.
    with open('百度热榜.txt', 'w', encoding='utf8') as fp:
        print("****百度热榜****")
        for i, li in enumerate(list_page, start=1):
            # Second <span> is read as the title, third as the hot degree;
            # `[0]` raises IndexError if the page layout no longer matches.
            title = li.xpath('./div/span[2]/text()')[0]
            hot_degree = li.xpath('./div/span[3]/text()')[0]
            print(i, title, "点击量:" + hot_degree)
            # NOTE(review): the ' 04-27' date suffix is hard-coded; confirm
            # whether it should be the date of the actual run.
            fp.write(f"{i},{title} 点击量:{hot_degree} 04-27\n")
再来上程序(爬取微博热榜)
import requests
from lxml import etree
if __name__ == '__main__':
    # Script entry point: download the tophub.today front page, pull the
    # entries of the panel with id "node-1" (saved here as the Weibo hot
    # list), print them, and write them to a UTF-8 text file.
    header = {
        # Browser-like User-Agent sent with the request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4442.4 Safari/537.36'
    }
    url = 'https://tophub.today/'
    # A timeout keeps the script from hanging forever on a stalled connection.
    page_text = requests.get(url=url, headers=header, timeout=10).text
    # Keep the parsed tree under its own name; rebinding `etree` would shadow
    # the lxml module and break any later `etree.HTML(...)` call.
    html_tree = etree.HTML(page_text)
    # Each <a> element under this path is one entry of the panel.
    list_page = html_tree.xpath('//*[@id="node-1"]/div/div[2]/div[1]/a')
    # The context manager guarantees the file is flushed and closed, even if
    # an entry fails to parse part-way through.
    with open('微博热榜.txt', 'w', encoding='utf8') as fp:
        for i, a in enumerate(list_page, start=1):
            # Second <span> is read as the title, third as the hot degree;
            # `[0]` raises IndexError if the page layout no longer matches.
            title = a.xpath('./div/span[2]/text()')[0]
            hot_degree = a.xpath('./div/span[3]/text()')[0]
            print(i, title, "点击量:" + hot_degree)
            # NOTE(review): the ' 04-27' date suffix is hard-coded; confirm
            # whether it should be the date of the actual run.
            fp.write(f"{i},{title} 点击量:{hot_degree} 04-27\n")
首先确定目标网站
https://tophub.today/
然后按 F12 审查元素,找到我们需要定位的标签
最后再把爬取到的热榜信息输出到 txt 文本文件中。