使用正则以及BS4爬取图片
使用 requests 库、正则表达式和 BS4(BeautifulSoup)编写的一个图片爬虫,并用 easygui 做了一个简单的图形界面。
import requests
import os
import re
from bs4 import BeautifulSoup
import easygui as g
def url_open(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``response.content``). The same bytes
        are used both for writing image files and for feeding the HTML
        parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Send a desktop-browser User-Agent along with the request.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0'}
    response = requests.get(url, headers=header, timeout=timeout)
    # Use .content (bytes) for binary data such as images; .text is for
    # decoded text.
    return response.content
def get_imgs(html):
    """Extract the ``.jpg`` image URLs from a page.

    Args:
        html: Page markup to parse.

    Returns:
        A list of the ``src`` attribute values ending in ``.jpg`` that appear
        inside the elements selected by the style filter below, in the order
        the regular expression finds them.
    """
    # Parse with Python's built-in parser; the lxml parser would be faster.
    parsed = BeautifulSoup(html, "html.parser")
    # First pass: keep only the elements carrying this inline style, rendered
    # back to a single string of markup.
    candidates = str(parsed.find_all(style="margin-bottom: 10px;"))
    # Second pass: pull the .jpg src values out of that markup with a regex.
    return re.findall(r'src="([^"]+\.jpg)"', candidates)
def download(folder, pages, qh):
    """Download the images of one issue into *folder*.

    Pages are requested as ``?page=N`` for N from ``pages - 1`` down to 0.
    Every image found is saved as ``0.jpg``, ``1.jpg``, ... with the counter
    running across all pages. A message box is shown once all pages have
    been processed.

    Args:
        folder: Directory to save into. It becomes the process's current
            working directory.
        pages: Number of pages to fetch (any value ``int()`` accepts).
        qh: Issue number; formatted into the base URL with ``%d``.
    """
    # Change into the target directory so the files are written there.
    os.chdir(folder)
    url = r'http://www.nenml.com/12/%d' % qh
    # Convert once and use the converted value as the loop bound too, so a
    # numeric string works the same as an int.
    page_count = int(pages)
    saved = 0
    # Walk the pages from the highest page number down to 0.
    for page_num in reversed(range(page_count)):
        page_url = url + '?page=' + str(page_num)
        for each in get_imgs(url_open(page_url)):
            # Fetch first, then open the file: a failed request must not
            # leave an empty .jpg behind.
            img = url_open(each)
            filename = str(saved) + '.jpg'
            with open(filename, 'wb') as f:
                f.write(img)
            saved += 1
    g.msgbox('下载成功!')
def _main():
    """Ask for folder, page count and issue number, then start the download."""
    # Each easygui dialog returns None when the user cancels; stop quietly
    # instead of passing None on to download().
    folder = g.diropenbox()
    if folder is None:
        return
    pages = g.integerbox('请输入需要下载的页数:(大于1小于10!)',lowerbound=1,upperbound = 10)
    if pages is None:
        return
    qh = g.integerbox('请输入需要下载的刊期:',lowerbound=1,upperbound = 10000)
    if qh is None:
        return
    download(folder, pages, qh)


# The code below does not run when this file is imported as a module.
if __name__ == '__main__':
    _main()