爬取B站首页图片和文字并制作词云图
1. 爬取B站首页部分图片并将图片的路径保存在txt文件中
import requests
from bs4 import BeautifulSoup
import re

# 1. Fetch the Bilibili homepage.
# NOTE(review): no User-Agent header is sent; the site may serve a reduced
# page to the default requests UA — confirm the response is complete.
response = requests.get('https://www.bilibili.com/', timeout=10)
html = response.content.decode()

# 2. Parse the HTML and collect every <img> tag.
soup = BeautifulSoup(html, 'lxml')
img_tags = soup.find_all('img')

# 3. Save each image URL to a text file, one per line.
#    Skip tags without a src attribute so literal "None" lines
#    are not written (same filtering the title script already does).
with open('picture.txt', 'w') as fp:
    for tag in img_tags:
        src = tag.get('src')
        if src:
            fp.write(src + "\n")
下面是执行后保存的picture.txt文件
https://i0.hdslb.com/bfs/vc/c1e19150b5d1e413958d45e0e62f012e3ee200af.png
//i0.hdslb.com/bfs/feed-admin/4f30e55dedb8b3eb3815e34fbf3f46f5fcf3b9b0.png@880w_388h_1c_95q
//i0.hdslb.com/bfs/feed-admin/e84c14afb6a8270e0a2d709017644349f5563dfa.jpg@880w_388h_1c_95q
https://i0.hdslb.com/bfs/sycp/creative_img/202103/dfe7afea68c5b3266ec92f310a1b2a24.jpg@880w_388h_1c_95q
//i0.hdslb.com/bfs/feed-admin/82bd34a651837f76fea68772719ec1936b4b892f.jpg@880w_388h_1c_95q
//i0.hdslb.com/bfs/feed-admin/22a7ee1972b5d8b49b2f11adb1186ae317cd3f3a.jpg@880w_388h_1c_95q
//i0.hdslb.com/bfs/archive/368d12cf903a88a69ac37fdd6d59489c030629fa.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/132650480ce6372eb6d2224be1e7b54530452278.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/f5f946958509e35634e246d1d0669cabee51af30.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/c38d2157323af84161233fbf674e268c6273f3ab.jpg@412w_232h_1c.jpg
//i0.hdslb.com/bfs/archive/c49ca8cdd59455e70314700b711422eb9b23f485.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/2d65e70248889455b0d18d7f0efdaa05ba025178.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/db5b389bf5ca131c60f9e872b6bbb4aae00a7562.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/bdbe85a2472a61085c311ac7397de683e5bbfeef.jpg@412w_232h_1c.jpg
//i2.hdslb.com/bfs/archive/2c5003007d6b87e5a148f9b2064fc77a28f72664.jpg@412w_232h_1c.jpg
//i1.hdslb.com/bfs/archive/f102c90d7d72702b8afbbdaa8771da272e199b6a.jpg@412w_232h_1c.jpg
https://i0.hdslb.com/bfs/sycp/creative_img/202103/1ad63ab1b6774f66bf4c785263ad2c2c.jpg@412w_232h_1c
//i0.hdslb.com/bfs/feed-admin/6eacdd858bdc4824714d3210d5c6fb5300aca0f8.png@412w_232h_1c
https://i0.hdslb.com/bfs/sycp/creative_img/202101/3221b44b56c290bfe5e4ebb2226da364.jpg@412w_232h_1c
//i0.hdslb.com/bfs/archive/ece9eb5eafb7be37b08352947a20503fb8178ac9.jpg@412w_232h_1c
//i0.hdslb.com/bfs/archive/998f64c84a87d0684288eede17e138d7fa343a7e.jpg@412w_232h_1c
//i0.hdslb.com/bfs/archive/dbaf1bac582d1909c2626f9a6cfe61ed928baa72.jpg@412w_232h_1c
//i0.hdslb.com/bfs/feed-admin/c5485c331cddb03e6e7cd7f1d7739b9b1ad516a7.jpg
//i0.hdslb.com/bfs/archive/31b8d2617cb8d6b01e98425b7eba39ae9f973c20.png
2. 爬取B站首页文字
import requests
from bs4 import BeautifulSoup
import re

# 1. Fetch the Bilibili homepage.
response = requests.get('https://www.bilibili.com/', timeout=10)
html = response.content.decode()

# 2. Parse the HTML; the video titles live in the alt attributes
#    of the <img> tags.
soup = BeautifulSoup(html, 'html.parser')
img_tags = soup.find_all('img')

# 3. Save each title to a text file, one per line.
#    Some <img> tags have no alt attribute (tag.get returns None),
#    so filter out None/empty values before writing.
with open('title.txt', 'w', encoding='utf-8') as fp:
    for tag in img_tags:
        alt = tag.get('alt')
        if alt:
            fp.write(alt + "\n")
下面是爬取的文字title.txt
蜘蛛子的地上冒险要开始了~
喜剧人的像素风游戏
DLC坏种现已上线
从动物视角看世界
真正的交流会上线了?
B 界 等 级 修 仙 传
【Sylar】《山丘》——我的兄弟阿圾
你为什么看不懂酒单?只学六杯直接毕业!
可莉,意大利炮,开炮!!
【重策小屋01】《西国建筑师》——能抓人的工人放置游戏
偶像的治愈魔法 终归无法治愈自己
【阿斗】集集惊心动魄!越狱计划频发变故人员超载,经典美剧《越狱》第一季8-11集全解读
异世界死亡轮回,三分钟带你重温Re0的感动瞬间!【Re0系列混剪/圣域篇完结纪念】
解析十字军长征之路 | 如何在东方建立信仰的天国
【怪物猎人rise】15分钟赚55W!游戏最强赚钱攻略
令人血脉偾张的全新盛宴
游戏赛事库
3. 制作词云图
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba

# 1. Read the scraped titles.
with open('title.txt', "r", encoding="utf-8") as f:
    text = f.read()

# 2. Segment the Chinese text with jieba; WordCloud expects a
#    space-separated token string.
tokens = jieba.cut(text, cut_all=False)
segmented = " ".join(tokens)

# 3. Build the word cloud. font_path must point to a font with CJK
#    glyphs (here Microsoft YaHei), otherwise Chinese characters render
#    as boxes. Raw string so the Windows-path backslashes are not
#    treated as (invalid) escape sequences.
wordcloud = WordCloud(
    font_path=r"C:\WINDOWS\FONTS\MSYH.TTC",
    background_color="black",
    width=1000,
    height=860,
    margin=2,
).generate(segmented)

# Save the image BEFORE plt.show(): show() blocks until the window is
# closed, and the file would be lost if the process is killed there.
wordcloud.to_file('cloud.png')

# 4. Display the result.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
cloud.png展示效果如下: