A Small Python Crawler

This script parses HTML and scrapes images from the web, which can be helpful for learning how web crawlers work. It is based on Python 3 and the third-party libraries requests and BeautifulSoup. The basic idea: use requests to download a page, use BeautifulSoup to parse the page content and extract the target image tags and their links, then use requests again to download the images.
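
Before walking through the full script, here is a minimal sketch of that flow. It is a stripped-down illustration, not the script itself: the page URL is a placeholder, the '.pic img' selector is the one the full script uses on its detail pages, and all error handling, paging and logging are omitted.

import os

import requests
from bs4 import BeautifulSoup

page_url = "http://www.netbian.com/"  # placeholder: any page that contains the target <img> tags
html = requests.get(page_url, timeout=30).content

soup = BeautifulSoup(html, 'html.parser')
for img in soup.select('.pic img'):  # CSS selector for the image tags; adjust to the page you scrape
    src = img['src']
    data = requests.get(src, timeout=30).content
    with open(os.path.basename(src), 'wb') as f:  # save under the image's own file name
        f.write(data)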

Prepare the environment

The script was written for Python 3 on Ubuntu; Windows and other platforms work as well, as long as the required third-party libraries are installed.

Install pip

sudo apt-get install python3-pip

Install requests

This library is used to get html content.

pip3 install requests
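
A quick sanity check after installing, independent of the crawler itself (the URL here is just the site the script targets; any reachable page works):

import requests

response = requests.get("http://www.netbian.com/", timeout=30)
print(response.status_code)   # 200 means the page was fetched
print(len(response.content))  # size of the raw HTML, ready to be parsed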

Install BeautifulSoup

This library is used to parse html content, it's very powerful.

pip3 install bs4
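
BeautifulSoup can be tried on a small literal HTML snippet with no network access at all; select() takes a CSS selector, which is how the script below locates its image tags:

from bs4 import BeautifulSoup

html = '<div class="pic"><img src="http://example.com/a.jpg"></div>'
soup = BeautifulSoup(html, 'html.parser')
for img in soup.select('.pic img'):
    print(img['src'])  # prints http://example.com/a.jpg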

Run the script

Run command:

python download_pic3.py

Source code

download_pic3.py

import logging
import os
import re
from logging import FileHandler, StreamHandler
from urllib import parse

import requests
from bs4 import BeautifulSoup

# Define base directory to store downloaded images and log file
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

MAIN_URL = "http://www.netbian.com/"

# Logger object
logger = logging.getLogger('download_images_logger')


class ImageCategory:
    """
    Image category class, used to store each category
    """

    def __init__(self, title, url, pages):
        self.title = title
        self.url = url
        self.pages = pages


class ImageItem:
    """
    Image item class, used to parse sub image urls and download each image
    """

    def __init__(self, title, main_url, local_dir):
        self.title = title
        self.main_url = main_url
        self.local_dir = local_dir
        self.__sub_urls = []

    def __parse_sub_urls(self):
        logger.info("Parsing image urls...")
        soup = BeautifulSoup(get_html_content(self.main_url), 'html.parser')
        images = soup.select('.pic img')
        for image in images:
            self.__sub_urls.append(image['src'])

    def __download_all_images(self):
        logger.info("Downloading images...")
        for url in self.__sub_urls:
            download_image(url, self.local_dir)

    def process_images(self):
        logger.info(
            "Parse image item: " + self.title + ", main url: " + self.main_url + ", local dir: " + self.local_dir)
        try:
            self.__parse_sub_urls()
            self.__download_all_images()
        except Exception as e:
            logger.error("Failed to parse image, main url: " + self.main_url)
            logger.error(str(e))


def download_image(url, local_dir):
    """
    Download an image file
    :param url: URL
    :param local_dir: Local directory to store image file
    :return: None
    """

    name = os.path.basename(url)
    local_image_path = os.path.join(local_dir, name)

    # Local image exists
    if os.path.exists(local_image_path):
        logger.debug("Image file exists, no need to download, %s" % local_image_path)
        return

    # Local image doesn't exist
    response = None
    retry_times = 3
    for i in range(retry_times):
        try:
            logger.debug("Downloading image retry times #%d: %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), stream=True, timeout=30)
            logger.debug("Status code: " + str(response.status_code))
            if response.status_code == requests.codes.ok:
                with open(local_image_path, 'wb') as f:
                    logger.debug("Getting image content...")
                    content = response.content
                    logger.debug("Writing image file: %s" % local_image_path)
                    f.write(content)
                logger.debug("Downloaded: " + url)
                break
            else:
                logger.error("Cannot download image: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while download images")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()


def get_html_content(url):
    """
    Get html result
    :param url: URL
    :return: result
    """
    result = ''
    response = None
    retry_times = 3
    for i in range(retry_times):
        try:
            logger.debug("Getting html content retry times #%d, %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), timeout=30)
            if response.status_code == requests.codes.ok:
                result = response.content
                break
            else:
                logger.error("Failed to get html from: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while getting html result")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()
    return result


def get_category_pages(url):
    """
    Get the number of pages of a category from its pagination block
    :param url: category URL
    :return: number of pages, or 0 if it cannot be determined
    """
    try:
        soup = BeautifulSoup(get_html_content(url), 'html.parser')
        page = soup.select('.page')
        pattern = re.compile(r'…(\d+)')
        result = re.findall(pattern, page[0].get_text())
        if len(result) > 0:
            return result[0]
        else:
            logger.warning("Cannot get category pages, %s" % url)
            return 0
    except Exception as e:
        logger.error("Failed to parse url: " + url)
        logger.error(str(e))
        return 0


def parse_category_page(category):
    """
    Parse category page
    :param category: category
    :return: None
    """

    # Check category directory
    logger.info("Parsing category: " + category.title + ", url: " + category.url)
    category_path = os.path.join(BASE_DIR, 'images', category.title)
    if not os.path.exists(category_path):
        logger.info("Making dir: " + category_path)
        os.makedirs(category_path)

    # Parse each page of current category
    for i in range(1, category.pages + 1):
        page_name = ''
        if i > 1:
            page_name = "index_" + str(i) + ".htm"

        # Find all image links in current page
        page_url = os.path.join(category.url, page_name)
        logger.info("Page url: " + page_url)
        try:
            soup = BeautifulSoup(get_html_content(page_url), 'html.parser')
            image_links = soup.select('.list a')
            for link in image_links:

                # Skip image links that are absolute http:// urls
                if 'http://' in link['href']:
                    continue

                # Title is always empty, not needed here
                image = ImageItem('', parse.urljoin(MAIN_URL, link['href']), category_path)
                image.process_images()
        except Exception as e:
            logger.error("Failed to parse url: " + page_url)
            logger.error(str(e))


def get_request_header():
    headers = {
        "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    return headers


def get_categories():
    """
    Get all categories.
    :return: all categories
    """
    categories = []

    # Get all categories from main page
    soup = BeautifulSoup(get_html_content(MAIN_URL), 'html.parser')
    categories_links = soup.select('.cate a')
    for ca_link in categories_links:

        # Skip categories whose url is an absolute http:// link
        if 'http://' in ca_link['href']:
            continue
        category_url = parse.urljoin(MAIN_URL, ca_link['href'])
        category_pages = get_category_pages(category_url)
        category = ImageCategory(ca_link.get_text(), category_url, int(category_pages))
        logger.debug("Category title: %s, url: %s, pages: %s" % (category.title, category.url, category.pages))
        categories.append(category)

    return categories


def init_logger():
    """
    Init logger
    :return: None
    """
    logger.setLevel(logging.DEBUG)

    logfile = FileHandler(os.path.join(BASE_DIR, 'main3.log'))
    console = StreamHandler()

    logfile.setLevel(logging.DEBUG)
    console.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logfile.setFormatter(fmt=formatter)
    console.setFormatter(fmt=formatter)

    logger.addHandler(logfile)
    logger.addHandler(console)


def start_all():
    # Create base dir if it doesn't exist
    if not os.path.exists(BASE_DIR):
        os.mkdir(BASE_DIR)
    init_logger()
    logger.info("Starting main...")
    categories = get_categories()
    for category in categories:
        parse_category_page(category)
    logger.info("Finished")


if __name__ == '__main__':
    start_all()

 
