1. 爬取并保存
Response.text 用于文本内容
Response.content 用于图片、视频、音频等二进制内容
import requests

# Fetch an article and save its decoded text to a local file.
# (Renamed `re` -> `response`: `re` shadows the stdlib regex module name.)
response = requests.get(
    'https://apiv3.shanbay.com/codetime/articles/mnvdu',
    timeout=10,  # fail fast instead of hanging on a dead connection
)
# Report the HTTP status so a failed fetch is visible.
print('网页的状态码为%s' % response.status_code)
# encoding='utf-8' so the Chinese text is written portably — the platform
# default encoding (e.g. on Windows) may not be UTF-8.
with open('鲁迅文章.txt', 'w', encoding='utf-8') as file:
    # Write the string form of the response body to the file.
    print('正在爬取小说')
    file.write(response.text)
import requests

# Download an image; Response.content holds the raw bytes.
res = requests.get(
    'https://img-blog.csdnimg.cn/20210424184053989.PNG',
    timeout=10,  # fail fast instead of hanging on a dead connection
)
# Open datawhale.png in binary-write mode ('wb'): image data must be written
# as bytes, not text.  (The original comment wrongly said "info.jpg".)
with open('datawhale.png', 'wb') as ff:
    # Write the binary form of the response body to the file.
    ff.write(res.content)
2. BeautifulSoup
爬取学校官网一级页面、二级页面和三级页面
import json
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Kept for backward compatibility with any code that reads it; the crawl
# itself no longer depends on this module-level accumulator (see below).
url_list = []


# Get URLs
def getURLs(url):
    """Return the deduplicated FST links found on the page at *url*.

    Fetches the page, extracts every ``<a href>``, resolves relative
    paths against *url*, then keeps only links under ``https://www.fst``
    that end with ``/``.

    Bug fixed: the original appended into the module-level ``url_list``,
    so every call re-processed — and re-returned — the links gathered by
    all previous calls.  A local list is used instead.
    """
    html = requests.get(url, timeout=10)
    soup = BeautifulSoup(html.text, 'html.parser')
    hrefs = []
    for link in soup.find_all('a'):
        href = link.get("href")
        if href:  # skip <a> tags with no href attribute (was str(None))
            hrefs.append(str(href))
    # Complement relative path: resolve each href against the page URL.
    # (The original concatenated onto the *base* argument, which is wrong
    # for links collected from pages below the site root.)
    absolute = [urljoin(url, h) for h in hrefs]
    # Filter out FST web pages and remove duplicated items.
    url_fst = set(filter(lambda x: "https://www.fst" in x, absolute))
    url_fst = list(set(filter(lambda x: x[-1] == "/", url_fst)))
    return url_fst
# Save the output
# Save the output
def save(data, path=r"C:\Users\Sandra\Desktop\url\url.txt"):
    """Append *data* to *path* as one JSON-encoded line.

    Args:
        data: any JSON-serializable object (here: a URL string).
        path: output file; defaults to the original hard-coded location,
            so existing callers are unaffected.
    """
    conv = json.dumps(data)
    # Append mode + context manager: one line per call, and the handle is
    # closed even if the write raises (the original used open/close).
    with open(path, "a", encoding='UTF-8') as f:
        f.write(conv + "\n")
# Crawl three link-levels of the FST site and persist every unique URL.

# Level 1: links found on the homepage.
print("Homepage")
result1 = getURLs('https://www.fst.um.edu.mo/')

# Level 2: links found on every first-level page (except the homepage).
print("Subpages")
result2 = []
for page in result1:
    if page != "https://www.fst.um.edu.mo/":
        result2 += getURLs(page)

# Drop second-level links that already appeared at the first level.
uniq_result2 = list((set(result2) | set(result1)) - set(result1))

# Level 3: links found on the surviving second-level pages.
print("Subsubpages")
result3 = []
for page in uniq_result2:
    result3 += getURLs(page)

# Merge all three levels, dedupe, and write one JSON line per URL.
fst_Urls = list(set(result1 + uniq_result2 + result3))
for item in fst_Urls:
    save(item)
print(len(fst_Urls))