视频有讲解
爬虫
# -*- coding: utf-8 -*-
# @Time : 2021/1/26 10:19
# @Author : 老七疯狂吸氧
# @file hotlist1.py
# @Software:PyCharm
import requests
import re
import time
import urllib.parse
def main():
    """Crawl every configured tophub.today board and store its top entries.

    For each board the page is downloaded with ``get_html``, parsed with
    ``saveurl``, and every entry is formatted as ``"<rank>.<title> <hot> <link>"``
    and handed to ``savelist`` together with the board name and today's date.
    """
    today = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    # Board name -> [ranking page URL, number of entries to save].
    urllist = {
        "微博今日热榜": ["https://tophub.today/n/KqndgxeLl9", 50],
        "虎嗅网热文榜": ["https://tophub.today/n/5VaobgvAj1", 15],
        "csdn技术区热帖": ["https://tophub.today/n/K7GdajgeQy", 50],
        "知乎热榜": ["https://tophub.today/n/mproPpoq6O", 50],
        "B站日榜": ["https://tophub.today/n/74KvxwokxM", 100],
        "six氪日榜": ["https://tophub.today/n/Q1Vd5Ko85R", 10],
        "吾爱破解日榜": ["https://tophub.today/n/NKGoRAzel6", 15],
        "豆瓣电影新片榜": ["https://tophub.today/n/mDOvnyBoEB", 10],
    }

    for key, (page_url, limit) in urllist.items():
        datalist = get_html(page_url)
        hotname = saveurl(datalist)
        # The page may yield fewer rows than the configured count; slicing
        # keeps us from indexing past what was actually parsed.
        entries = list(hotname.items())[:limit]
        for rank, (title, details) in enumerate(entries, start=1):
            content = str(rank) + "." + title + " " + details[0] + " " + details[1]
            savelist(content, key, today)
        print("爬取", key, "完毕")
def get_html(url, timeout=10):
    """Download a page with a single GET request and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            with a timeout exception. Without a timeout a stalled connection
            would block the whole crawl indefinitely.

    Returns:
        The decoded response body (``response.text``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }  # substitute your own User-Agent here if needed
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.text
def saveurl(baseurl):
    """Extract the ranking entries from the HTML of a board page.

    Args:
        baseurl: HTML text of the page (despite the name, this is the page
            content that the patterns are run against, not a URL).

    Returns:
        A dict mapping each entry title to ``[hot, link]``: ``hot`` is the
        text of a plain ``<td>`` cell and ``link`` is the entry's ``href``
        passed through ``splicing``. If a title occurs more than once, the
        last occurrence wins.
    """
    title_pattern = re.compile(r'<td class="al"><a href=".*?" target="_blank" rel="nofollow" itemid=".*?">(.*?)</a></td>')
    hot_pattern = re.compile(r'<td>(.*?)</td>')
    href_pattern = re.compile(r'<td class="al"><a href="(.*?)" target=')

    titles = title_pattern.findall(baseurl)
    hots = hot_pattern.findall(baseurl)
    hrefs = href_pattern.findall(baseurl)

    # zip() stops at the shortest list, so a page on which the three patterns
    # match a different number of cells no longer raises IndexError.
    return {
        title: [hot, splicing(href)]
        for title, hot, href in zip(titles, hots, hrefs)
    }
def savelist(list, name, t):
    """Append one line of text to the file ``<name><t>.txt`` (UTF-8).

    Args:
        list: The text line to store (a ``str``; the parameter name is kept
            for compatibility with existing callers).
        name: File name prefix.
        t: Suffix appended to ``name`` before the ``.txt`` extension.
    """
    txtname = name + t
    # The context manager closes the file even if the write fails.
    with open(txtname + ".txt", "a", encoding="utf-8") as out:
        out.write(list + "\n")
def splicing(get_url):
    """Resolve a link against the tophub.today site root.

    Args:
        get_url: An absolute or site-relative link.

    Returns:
        The result of joining ``get_url`` onto ``https://tophub.today`` with
        ``urllib.parse.urljoin``.
    """
    return urllib.parse.urljoin('https://tophub.today', get_url)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
如果好用可以点个赞加个关注。