"""
爬取Penn-Fudan的行人检测和分割数据集。Penn-Fudan数据集中有170张图像,包含345个行人的实例。
爬取链接:https://www.cis.upenn.edu/~jshi/ped_html/
方法较笨,后续改善。
"""
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def getHtmlText(url, timeout=30):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The decoded page text, or the literal string ``"error"`` when the
        request fails (connection problem, timeout, invalid URL, or a
        non-2xx HTTP status). The sentinel is kept so existing callers that
        pass the result straight to an HTML parser keep working.
    """
    try:
        print("craw html:", url)
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Use the encoding guessed from the body rather than the one from the
        # HTTP headers before decoding ``r.text``.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures are turned into the sentinel; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        return "error"
def getImageList(url, save_dir=None, timeout=30):
    """Download every target linked from an ``<a href=...>`` on the page at *url*.

    The page is fetched with ``getHtmlText``; each anchor's ``href`` is
    resolved against *url* and the response body is written to a file named
    after the last ``/``-separated segment of the resolved link.

    Args:
        url: Address of the HTML page whose links should be downloaded.
        save_dir: Directory to write the files into. When ``None`` the
            module-level ``root_path`` (set by the script entry point) is
            used, which is what the function always did before.
        timeout: Seconds to wait for each download before skipping it.

    Returns:
        None. Files are written as a side effect. Links that cannot be
        downloaded or saved are reported on stdout and skipped.
    """
    if save_dir is None:
        # Backward-compatible fallback to the global set under ``__main__``.
        save_dir = root_path

    # When the page fetch fails getHtmlText returns "error", which parses to a
    # document without anchors, so nothing is downloaded in that case.
    html = getHtmlText(url)
    soup = BeautifulSoup(html, 'html.parser')

    links = []
    for anchor in soup.find_all('a'):
        href_txt = anchor.get('href')
        if not href_txt:
            # Anchor without a usable href: nothing to download.
            continue
        # urljoin resolves relative hrefs against the page URL and leaves
        # absolute hrefs intact, instead of blindly prefixing a fixed base.
        links.append(urljoin(url, href_txt))

    for link in links:
        filename = link.split('/')[-1]
        if not filename:
            # Link ends with '/', so there is no file name to save under.
            continue
        try:
            img_data = requests.get(link, timeout=timeout)
            # Do not store an HTTP error page as if it were the linked file.
            img_data.raise_for_status()
            # os.path.join picks the right separator for the current OS.
            with open(os.path.join(save_dir, filename), "wb") as f:
                f.write(img_data.content)
        except (requests.RequestException, OSError) as err:
            # Best effort: report the failure and carry on with the next link.
            print("download failed:", link, err)
            continue
if __name__ == "__main__":
    # ``root_path`` is intentionally a module-level name: getImageList reads it
    # to decide where downloaded files are written.
    root_path = input('请输入图片要存放的地址:')
    if root_path:
        # Create the target directory up front so downloads do not all fail
        # just because the folder does not exist yet.
        os.makedirs(root_path, exist_ok=True)
    # The listing is spread over pageshow1.html .. pageshow6.html.
    for page_no in range(1, 7):
        url = f"https://www.cis.upenn.edu/~jshi/ped_html/pageshow{page_no}.html"
        # getImageList fetches the page itself; a separate getHtmlText call
        # here would only download every page a second time.
        getImageList(url)