1. Overview
I have recently started learning Python web crawling, and this post records what I have learned by scraping image data from the web. The idea of the program is simple: the crawler opens a web page (one that does not block crawlers), parses it with bs4, finds the img tags to extract the original image URLs, and then downloads the images from those URLs.
2. Implementation
# -*- coding: utf-8 -*-
import urllib2 as url_lib  # networking (Python 2)
import urllib as url       # urlretrieve, for downloading files
import re                  # regular expressions
import bs4 as BS4          # BeautifulSoup, for HTML parsing
import os
def GetHtml(url_str):
    # Fetch the raw HTML of url_str; return None on failure.
    if url_str is None:
        print("url_str is null")
        return None
    try:
        my_html = url_lib.urlopen(url_str)
    except url_lib.HTTPError as ex:
        print(ex)
        return None  # otherwise my_html below would be undefined
    return my_html.read()
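urllib2 exists only on Python 2. For readers on Python 3, here is a minimal sketch of the same fetch helper, assuming only the standard library (the name get_html_py3 is mine, not part of the original script):

# Minimal Python 3 sketch of GetHtml, using urllib.request instead of urllib2.
import urllib.request
import urllib.error

def get_html_py3(url_str):
    if url_str is None:
        print("url_str is null")
        return None
    try:
        return urllib.request.urlopen(url_str).read()
    except urllib.error.HTTPError as ex:
        print(ex)
        return None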
def GetHref(html, isshow_href=False):
    # Collect every <a> tag whose href is an absolute http:// link.
    if html is not None:
        href_re = re.compile("^(http://).+")
        soup = BS4.BeautifulSoup(html, "html.parser")
        href_list = soup.findAll("a", {"href": href_re})
        print("href_list count: %d" % len(href_list))
        if isshow_href:
            for item in href_list:
                print("href_url: %s" % item["href"])
        return href_list
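GetHref is not actually used below (the call in the main section is commented out), but a quick usage sketch, reusing the same Tieba URL, would be:

# Illustrative use of GetHref: list all absolute links on the page.
html = GetHtml("http://tieba.baidu.com/p/4966422758")
if html is not None:
    outer_links = GetHref(html, isshow_href=True)  # prints each http:// href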
# Build the absolute URL of the file to download
def GetAbsoluteUrl(baseUrl, sourceUrl):
    if sourceUrl.startswith("http://www."):  # normalize and clean the URL
        url = "http://" + sourceUrl[11:]
    elif sourceUrl.startswith("www."):
        url = "http://" + sourceUrl[4:]
    elif sourceUrl.startswith("http://"):
        return sourceUrl
    else:
        url = baseUrl + "/" + sourceUrl  # relative path: join onto the base URL
    if baseUrl not in url:  # drop hotlinked (off-site) resources
        return None
    return url
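To make the branches concrete, here are a few hypothetical inputs and the value GetAbsoluteUrl returns for each:

# Hypothetical example URLs; comments show the value each call returns.
base = "http://tieba.baidu.com/p/4966422758"
GetAbsoluteUrl(base, "http://imgsrc.baidu.com/pic/1.jpg")  # returned unchanged
GetAbsoluteUrl(base, "pic/1.jpg")            # -> base + "/pic/1.jpg"
GetAbsoluteUrl(base, "www.other.com/1.jpg")  # -> None: becomes http://other.com/1.jpg,
                                             #    which does not contain base

Note that a bare http:// URL is returned before the hotlink check, so only www.-prefixed and relative sources are ever filtered out.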
# the main function
url_str = "http://tieba.baidu.com/p/4966422758"
html = GetHtml(url_str)
if html is not None:
    # print(html)
    # OuterLink = GetHref(html, True)  # collect all external links
    try:
        html = BS4.BeautifulSoup(html, "html.parser")
    except AttributeError as ex:
        print(ex)
        exit(-1)
    # \. is escaped so it matches a literal dot, not any character
    img_list = html.findAll("img", {"src": re.compile(r"\w+\.((jpg)|(jpeg)|(bmp))$")})
    print("img_list count is: %d\n" % len(img_list))
    img_count = 0
    download_dir = r'e:\Release\my_temp'  # raw string keeps the backslashes literal
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    for item in img_list:
        img_url = GetAbsoluteUrl(url_str, item["src"])
        if img_url is not None:
            print("img tag is: %s" % img_url)
            dest = os.path.join(download_dir, '%d.jpg' % img_count)
            url.urlretrieve(img_url, dest)  # download the image to disk
            img_count += 1
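One detail worth noting: the loop saves every file as .jpg even though the regex also matches .jpeg and .bmp. A small variation (my own tweak, not part of the original) keeps the real extension by splitting it off the URL:

# Variation on the save step: keep the source file's actual extension.
ext = os.path.splitext(img_url)[1] or '.jpg'        # '.jpg', '.jpeg', '.bmp', ...
dest = os.path.join(download_dir, '%d%s' % (img_count, ext))
url.urlretrieve(img_url, dest)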
3. Results
Images on the original page:
The downloaded images: