获取微博热搜爬虫（项目）

崖丫

已于 2022-01-20 16:39:51 修改

阅读量755

点赞数 1

分类专栏： python 文章标签：爬虫 python 数据挖掘

于 2020-10-11 09:55:50 首次发布

本文链接：https://blog.csdn.net/weixin_43824551/article/details/109008014

版权

python 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

平台：

Crawlab（可以没有）

说明：

爬取当前微博热搜榜，把排名和标题连同爬取时间追加写入 test.txt。补充代码则把历次爬到的热搜标题去重合并，存入另一个文件 olddata.txt。

所用库：

requests，BeautifulSoup（bs4），以及标准库 re、time（parsel 的 Selector 虽已导入，但代码中并未实际使用）

任务分解：

1.爬取页面

2.准确找到目标位置

3.存入TXT

代码：

# -*- encoding: utf-8 -*-
# ---------------------------------------------
# 微博热搜
# ---------------------------------------------
import time
import re
import json
import sys
import copy
from collections import OrderedDict
from parsel import Selector
import requests#引入requests库用于下载网页
from bs4 import BeautifulSoup#BeautifulSoup用于解析网页

# Scrape the Weibo hot-search page and append "rank<TAB>title" lines, preceded
# by a timestamp line, to test.txt.
url = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"  # page to scrape
# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# Actually send the headers (they were previously built but never used) and
# bound the wait so a stalled connection cannot hang the job forever.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()  # fail loudly instead of logging an empty snapshot
r.encoding = 'utf-8'  # decode the body as UTF-8
soup = BeautifulSoup(r.text, 'html.parser')
# print(soup.prettify())  # uncomment to inspect the page structure

# Rank cells: elements whose class attribute is exactly "td-01 ranktop".
a = [cell.get_text().strip() for cell in soup.find_all(class_="td-01 ranktop")]

# Title links: <a target="_blank"> whose href starts with "/weibo?".
# The "?" is escaped so it matches a literal question mark rather than
# making the preceding "o" optional.
title_href = re.compile(r'^/weibo\?')
index = [
    link.get_text().strip()
    for link in soup.find_all('a', attrs={'href': title_href, 'target': '_blank'})
]

# NOTE(review): ranks and titles are paired purely by position. If the page
# contains a title row without a "ranktop" cell (e.g. a pinned entry), the
# pairing would be shifted by one -- confirm against the live markup.
data = dict(zip(a, index))

localtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # scrape time
timee = "爬取时间为:" + localtime

# Explicit UTF-8 so Chinese titles are written correctly regardless of the
# platform's default encoding; `with` closes the file even if a write fails.
with open('test.txt', 'a', encoding='utf-8') as f:
    f.write(timee + '\n')
    for rank, title in data.items():
        f.write(rank + '\t' + title + '\n')

缺陷：

BeautifulSoup 的解析速度不算快（数据量小时差别不明显）；代码写得也比较繁琐。

补充代码：

# -*- encoding: utf-8 -*-
# ---------------------------------------------
# 微博热搜查重版
# ---------------------------------------------
import time
import re
import json
import sys
import copy
from collections import OrderedDict
from parsel import Selector
import requests#引入requests库用于下载网页
from bs4 import BeautifulSoup#BeautifulSoup用于解析网页

# Scrape the Weibo hot-search titles and merge them into olddata.txt,
# keeping one line per distinct title.
url = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"  # page to scrape
# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}

# Actually send the headers (they were previously built but never used) and
# bound the wait so a stalled connection cannot hang the job forever.
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()  # do not rewrite the archive from an error page
r.encoding = 'utf-8'  # decode the body as UTF-8
soup = BeautifulSoup(r.text, 'html.parser')
# print(soup.prettify())  # uncomment to inspect the page structure

# Title links: <a target="_blank"> whose href starts with "/weibo?".
# The "?" is escaped so it matches a literal question mark rather than
# making the preceding "o" optional.
title_href = re.compile(r'^/weibo\?')
index = [
    link.get_text().strip()
    for link in soup.find_all('a', attrs={'href': title_href, 'target': '_blank'})
]

# Load previously saved titles; on the very first run the file does not
# exist yet, which simply means there is no history to merge.
try:
    with open('olddata.txt', 'r', encoding='utf-8') as f:
        # Strip line endings so a final line without "\n" still compares
        # equal to a freshly scraped title.
        old_titles = [line.strip() for line in f]
except FileNotFoundError:
    old_titles = []

# dict.fromkeys de-duplicates while keeping first-seen order, so the file
# stays stable between runs (a set would reshuffle it every time).
# Empty strings are dropped to avoid writing blank lines.
all_list = [title for title in dict.fromkeys(old_titles + index) if title]

for title in all_list:
    print(title)

with open('olddata.txt', 'w', encoding='utf-8') as f:
    f.writelines(title + '\n' for title in all_list)