期末的Python考试要写一个爬取网站信息的程序,我就选取了B站番剧索引页面作为目标网页(因为感觉番剧主页的信息太杂了)。
目标网页:https://www.bilibili.com/anime/index
原本打算魔改老师给的范例,使用BeautifulSoup库来解析HTML获取数据的,
但是在运行的时候发现,好像获取不了数据?
原先使用的代码:
app.py (主程序)
import requests
from bs4 import BeautifulSoup
from writetext import TextStorage
from datetime import datetime
import os
class MySpider(object):
    """Crawl an anime index page and download the cover image of every entry.

    The spider fetches ``url``, looks for the ``<ul class="bangumi-list">``
    element, and for each ``<li>`` inside it downloads the cover image into
    ``directory`` and hands the title plus the saved image path to
    ``TextStorage.write``.
    """

    # Headers sent with every request made by the spider.
    header = {
        "Referer": "",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
    }
    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the crawl forever.
    timeout = 10

    def __init__(self, url, directory):
        """Create the HTTP session and make sure the output directory exists.

        Args:
            url: Address of the page to crawl.
            directory: Folder that receives the downloaded images.
        """
        self.s = requests.Session()
        self.url = url
        self.dir = directory
        # exist_ok avoids the check-then-create race and also creates any
        # missing parent directories.
        os.makedirs(directory, exist_ok=True)
        # Running counter appended to image file names to keep them unique.
        self.idx = 0

    def crawling(self):
        """Fetch the index page and store the cover and title of each entry.

        Raises:
            requests.HTTPError: If the page request returns an error status.
            RuntimeError: If the page contains no ``<ul class="bangumi-list">``.
        """
        rsp = self.s.get(self.url, headers=self.header, timeout=self.timeout)
        rsp.raise_for_status()
        soup = BeautifulSoup(rsp.text, "html.parser")
        tag = soup.find("ul", class_="bangumi-list")
        if tag is None:
            # find() returns None when nothing matches; fail with a clear
            # message instead of an AttributeError on the next line.
            raise RuntimeError(
                "no <ul class=\"bangumi-list\"> found in %s; the markup may "
                "have changed or the list may be filled in by JavaScript "
                "after the page loads" % self.url
            )
        tags_li = tag.find_all("li")
        with TextStorage() as xs:
            for li in tags_li:
                # Cover image: the src is prefixed with the scheme here.
                image_url = "https:%s" % li.a.div.img['src']
                print(image_url)
                filename = self.save_image(image_url)
                # Title of the entry.
                content = li.find("a", class_="bangumi-title").string
                url = self.dir + '/' + filename
                xs.write(content, url)

    def save_image(self, image_url):
        """Download ``image_url`` into the output directory.

        The file is named ``img_<YYYYmmdd_HHMM>_<counter>.jpg``; the counter
        is incremented on every call.

        Args:
            image_url: Address of the image to download.

        Returns:
            The file name (without directory) of the saved image.

        Raises:
            requests.HTTPError: If the image request returns an error status.
        """
        image = self.s.get(image_url, headers=self.header, timeout=self.timeout)
        # Do not write an error page to disk under a .jpg name.
        image.raise_for_status()
        suffix = datetime.now().strftime('%Y%m%d_%H%M')
        name = "img_%s_%d.jpg" % (suffix, self.idx)
        self.idx += 1
        with open(os.path.join(self.dir, name), 'wb') as file:
            file.write(image.content)
        return name
if __name__ == "__main__":
    # Script entry point: crawl the index page and store the covers locally.
    target_url = "https://www.bilibili.com/anime/index"  # page to crawl
    image_dir = "./爬取的图片"  # folder that receives the downloaded images
    spider = MySpider(target_url, image_dir)
    spider.crawling()
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24