"""Scrape <img> URLs from an HTML page — a simple practice script.

Fetches a page, normalizes it through BeautifulSoup, and prints every
``<img ... src="..."/>`` URL found in the serialized markup.
"""
import re
import urllib.request

from bs4 import BeautifulSoup

# Example of what we match:
# <img src="https://imagecloud.thepaper.cn/thepaper/image/123/253/812.jpg"/>
# NOTE: html.parser serializes void elements as <img .../>; the trailing
# "/> in the pattern relies on that normalization.
IMG_SRC_PATTERN = re.compile(r'<img.*?src="(.*?)"/>')


def extract_img_links(html_text):
    """Return every img ``src`` URL found in *html_text* (str or bytes of HTML).

    The markup is parsed and re-serialized by BeautifulSoup first, so the
    regex runs against normalized output — same behavior as the original
    ``findall(str(BeautifulSoup(...)))``.
    """
    # Explicit parser: omitting it raises GuessedAtParserWarning and makes
    # output depend on which parsers happen to be installed.
    soup = BeautifulSoup(html_text, 'html.parser')
    return IMG_SRC_PATTERN.findall(str(soup))


def main(url='https://www.thepaper.cn/'):
    """Fetch *url* and print each image link found in the page."""
    # Context manager closes the HTTP response even if parsing fails
    # (the original leaked the connection).
    with urllib.request.urlopen(url) as response:
        html_text = response.read()

    for link in extract_img_links(html_text):
        print(link)
        # TODO: if the src is relative, urljoin it with *url* before use.
        # Download sketch (kept from the original, typos fixed):
        #   with urllib.request.urlopen(link) as src:
        #       if src.status == 200:
        #           with open('./pic', 'wb') as f:
        #               f.write(src.read())


if __name__ == '__main__':
    main()