1. Overview
I have recently started learning Python web crawling, and this post records what I have learned by scraping image data from the web. The idea of the program is simple: the crawler opens a web page (one that does not block crawlers), parses it with bs4, finds the img tags to extract the original image URLs, and then downloads the images from those URLs.
2. Implementation
# -*- coding: utf-8 -*-
import urllib2 as url_lib  # networking (Python 2)
import urllib as url       # urlretrieve, for downloading files
import re                  # regular expressions
import bs4 as BS4          # BeautifulSoup, for HTML parsing
import os
def GetHtml(url_str):
    # Fetch the raw HTML of url_str; return None on failure.
    if url_str is None:
        print("url_str is null")
        return None
    try:
        my_html = url_lib.urlopen(url_str)
    except url_lib.HTTPError as ex:
        print(ex)
        return None  # otherwise my_html below would be undefined
    return my_html.read()
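urllib2 exists only on Python 2. For readers on Python 3, here is a minimal sketch of the same fetch helper, assuming only the standard library (the name get_html_py3 is mine, not part of the original script):

# Minimal Python 3 sketch of GetHtml, using urllib.request instead of urllib2.
import urllib.request
import urllib.error

def get_html_py3(url_str):
    if url_str is None:
        print("url_str is null")
        return None
    try:
        return urllib.request.urlopen(url_str).read()
    except urllib.error.HTTPError as ex:
        print(ex)
        return None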
def GetHref(html, isshow_href=False):
    # Collect every <a> tag whose href is an absolute http:// link.
    if html is not None:
        href_re = re.compile("^(http://).+")
        soup = BS4.BeautifulSoup(html, "html.parser")
        href_list = soup.findAll("a", {"href": href_re})
        print("href_list count: %d" % len(href_list))
        if isshow_href:
            for item in href_list:
                print("href_url: %s" % item["href"])
        return href_list
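GetHref is not actually used below (the call in the main section is commented out), but a quick usage sketch, reusing the same Tieba URL, would be:

# Illustrative use of GetHref: list all absolute links on the page.
html = GetHtml("http://tieba.baidu.com/p/4966422758")
if html is not None:
    outer_links = GetHref(html, isshow_href=True)  # prints each http:// href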
# Build the absolute URL of the file to download
def GetAbsoluteUrl(baseUrl, sourceUrl):
    if sourceUrl.startswith("http://www."):  # normalize and clean the URL
        url = "http://" + sourceUrl[11:]
    elif sourceUrl.startswith("www."):
        url = "http://" + sourceUrl[4:]
    elif sourceUrl.startswith("http://"):
        return sourceUrl
    else:
        url = baseUrl + "/" + sourceUrl  # relative path: join onto the base URL
    if baseUrl not in url:  # drop hotlinked (off-site) resources
        return None
    return url
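To make the branches concrete, here are a few hypothetical inputs and the value GetAbsoluteUrl returns for each:

# Hypothetical example URLs; comments show the value each call returns.
base = "http://tieba.baidu.com/p/4966422758"
GetAbsoluteUrl(base, "http://imgsrc.baidu.com/pic/1.jpg")  # returned unchanged
GetAbsoluteUrl(base, "pic/1.jpg")            # -> base + "/pic/1.jpg"
GetAbsoluteUrl(base, "www.other.com/1.jpg")  # -> None: becomes http://other.com/1.jpg,
                                             #    which does not contain base

Note that a bare http:// URL is returned before the hotlink check, so only www.-prefixed and relative sources are ever filtered out.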
# the main function
url_str = "http://tieba.baidu.com/p/4966422758"
html = GetHtml(url_str)
if html is not None:
    # print(html)
    # OuterLink = GetHref(html, True)  # collect all external links
    try:
        html = BS4.BeautifulSoup(html, "html.parser")
    except AttributeError as ex:
        print(ex)
        exit(-1)
    # \. is escaped so it matches a literal dot, not any character
    img_list = html.findAll("img", {"src": re.compile(r"\w+\.((jpg)|(jpeg)|(bmp))$")})
    print("img_list count is: %d\n" % len(img_list))
    img_count = 0
    download_dir = r'e:\Release\my_temp'  # raw string keeps the backslashes literal
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    for item in img_list:
        img_url = GetAbsoluteUrl(url_str, item["src"])
        if img_url is not None:
            print("img tag is: %s" % img_url)
            dest = os.path.join(download_dir, '%d.jpg' % img_count)
            url.urlretrieve(img_url, dest)  # download the image to disk
            img_count += 1
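One detail worth noting: the loop saves every file as .jpg even though the regex also matches .jpeg and .bmp. A small variation (my own tweak, not part of the original) keeps the real extension by splitting it off the URL:

# Variation on the save step: keep the source file's actual extension.
ext = os.path.splitext(img_url)[1] or '.jpg'        # '.jpg', '.jpeg', '.bmp', ...
dest = os.path.join(download_dir, '%d%s' % (img_count, ext))
url.urlretrieve(img_url, dest)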
3. Results
Images on the original page:
The downloaded images: