Using the BeautifulSoup package to crawl a website's page source and images
download_pages2() is the function that crawls the page source.
import re
import time
import requests
from bs4 import BeautifulSoup
def download_pages2(url, type, target):
    tar_url = url
    resp = requests.get(url)
    result = BeautifulSoup(resp.text, 'lxml')
    # links = re.findall('<a href="(.+?)"', resp.text)
    links = result.find_all('a')
    for link in links:
        link = link.get('href')  # .get() returns None instead of raising KeyError when an <a> has no href
        if link is None:
            continue
        # if 'articleid' in link:
        #     continue
        # if link.startswith('#'):
        #     continue
        if link.startswith('/'):  # a path starting with '/' is relative, so prepend the site root to form a full URL
            link = tar_url + link
        # download_file() fetches the page itself, so no separate requests.get(link) is needed here
        download_file(target, link, type)
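Joining tar_url and link by plain string concatenation only handles paths that start with '/'. As a minimal sketch (not part of the code above), the standard-library urllib.parse.urljoin resolves root-relative paths, relative paths and full URLs against a base URL in one call:

from urllib.parse import urljoin

base = 'http://www.woniunote.com'
print(urljoin(base, '/article/1'))                # http://www.woniunote.com/article/1 (hypothetical path, for illustration)
print(urljoin(base, 'http://www.example.com/a'))  # absolute URLs are returned unchanged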
download_images() is the function that crawls the images on a page.
def download_images(url, type, target):
    resp = requests.get(url)  # was hard-coded to 'http://www.woniunote.com'; use the url argument instead
    # images = re.findall('<img src="(.+?)"', resp.text)
    result = BeautifulSoup(resp.text, 'lxml')  # 'lxml' parses HTML; the 'xml' parser would miss most tags
    images = result.find_all('img')
    for image in images:
        image = image['src']
        # Build a full URL for the image
        if image.startswith('/'):  # a path starting with '/' is relative, so prepend the site root
            image = url + image
        if image.split('.')[-1] != 'jpg' and image.split('.')[-1] != 'png':
            # skip this image if the part after the last '.' is neither 'jpg' nor 'png'
            continue
        download_file(target, image, type)
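The extension check above compares image.split('.')[-1] twice. A small sketch of an equivalent, slightly more readable check (assuming, as the original does, that the URL ends with the bare extension and has no query string):

def is_wanted_image(image_url):
    # keep only the formats the crawler saves: jpg and png
    return image_url.split('.')[-1] in ('jpg', 'png')

print(is_wanted_image('http://www.woniunote.com/logo.png'))  # True  (hypothetical image URL)
print(is_wanted_image('http://www.woniunote.com/icon.gif'))  # False (hypothetical image URL)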
download_file() is the shared download function: the type passed in decides whether it saves a page or an image.
def download_file(target_name, t_url, target_type):  # target_name: directory to save into, t_url: URL to download, target_type: resource type
    resp = requests.get(t_url)
    if target_type == 'page':
        resp.encoding = 'utf-8'  # decode the page as UTF-8 before writing it out as text
        filename = t_url.split('/')[-1] + time.strftime("-%Y%m%d-%H%M%S") + '.html'
        documentary = target_name + filename
        # print(documentary)
        with open(f'{documentary}', mode='w', encoding='utf-8') as file:
            file.write(resp.text)
        print(f'Downloaded page: {t_url}')
    elif target_type == 'photo':
        filename = time.strftime("-%Y%m%d-%H%M%S") + t_url.split('/')[-1]
        documentary = target_name + filename
        # print(documentary)
        with open(f'{documentary}', mode='wb') as file:
            file.write(resp.content)
        print(f'Downloaded image: {t_url}')
    else:
        print("ERROR")
The main function collects url, type and target from the user.
url is the site to crawl, type is the type of resource to crawl, and target is the directory where the downloaded files are saved. For example:
url: http://www.xxxx.com
type: page
target: ./download/page/
def spider():
    url = input("Enter the site to crawl: ")
    type = input("Enter the type to crawl (page/photo): ")
    target = input("Enter the directory to save into: ")
    if type == 'photo':
        download_images(url, type, target)
    elif type == 'page':
        download_pages2(url, type, target)
    else:
        print("ERROR")
if __name__ == '__main__':
    # download_pages2()
    # download_images()
    # download_file()
    spider()
This is my first post, written to record my own learning; corrections and suggestions from more experienced readers are very welcome.