A Small Python Crawler

This script parses HTML and scrapes images from the web, which can be helpful for learning how web crawlers work. It is based on Python 3 and the third-party libraries requests and BeautifulSoup. The basic idea: use requests to download a page, use BeautifulSoup to parse the page content and extract the target image tags and their links, then use requests again to download the images.
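
Before walking through the full script, here is a minimal sketch of that flow. It is a stripped-down illustration, not the script itself: the page URL is a placeholder, the '.pic img' selector is the one the full script uses on its detail pages, and all error handling, paging and logging are omitted.

import os

import requests
from bs4 import BeautifulSoup

page_url = "http://www.netbian.com/"  # placeholder: any page that contains the target <img> tags
html = requests.get(page_url, timeout=30).content

soup = BeautifulSoup(html, 'html.parser')
for img in soup.select('.pic img'):  # CSS selector for the image tags; adjust to the page you scrape
    src = img['src']
    data = requests.get(src, timeout=30).content
    with open(os.path.basename(src), 'wb') as f:  # save under the image's own file name
        f.write(data)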

Prepare the environment

The script was written for Python 3 on Ubuntu; Windows and other platforms work as well, as long as the required third-party libraries are installed.

Install pip

sudo apt-get install python3-pip

Install requests

This library is used to get html content.

pip3 install requests
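
A quick sanity check after installing, independent of the crawler itself (the URL here is just the site the script targets; any reachable page works):

import requests

response = requests.get("http://www.netbian.com/", timeout=30)
print(response.status_code)   # 200 means the page was fetched
print(len(response.content))  # size of the raw HTML, ready to be parsed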

Install BeautifulSoup

This library is used to parse html content, it's very powerful.

pip3 install bs4
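
BeautifulSoup can be tried on a small literal HTML snippet with no network access at all; select() takes a CSS selector, which is how the script below locates its image tags:

from bs4 import BeautifulSoup

html = '<div class="pic"><img src="http://example.com/a.jpg"></div>'
soup = BeautifulSoup(html, 'html.parser')
for img in soup.select('.pic img'):
    print(img['src'])  # prints http://example.com/a.jpg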

Run the script

Run command:

python download_pic3.py

Source code

download_pic3.py

import logging
import os
import re
from logging import FileHandler, StreamHandler
from urllib import parse

import requests
from bs4 import BeautifulSoup

# Define base directory to store downloaded images and log file
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

MAIN_URL = "http://www.netbian.com/"

# Logger object
logger = logging.getLogger('download_images_logger')


class ImageCategory:
    """
    Image category class, used to store each category
    """

    def __init__(self, title, url, pages):
        self.title = title
        self.url = url
        self.pages = pages


class ImageItem:
    """
    Image item class, used to parse sub image urls and download each image
    """

    def __init__(self, title, main_url, local_dir):
        self.title = title
        self.main_url = main_url
        self.local_dir = local_dir
        self.__sub_urls = []

    def __parse_sub_urls(self):
        logger.info("Parsing image urls...")
        soup = BeautifulSoup(get_html_content(self.main_url), 'html.parser')
        images = soup.select('.pic img')
        for image in images:
            self.__sub_urls.append(image['src'])

    def __download_all_images(self):
        logger.info("Downloading images...")
        for url in self.__sub_urls:
            download_image(url, self.local_dir)

    def process_images(self):
        logger.info(
            "Parse image item: " + self.title + ", main url: " + self.main_url + ", local dir: " + self.local_dir)
        try:
            self.__parse_sub_urls()
            self.__download_all_images()
        except Exception as e:
            logger.error("Failed to parse image, main url: " + self.main_url)
            logger.error(str(e))


def download_image(url, local_dir):
    """
    Download an image file
    :param url: URL
    :param local_dir: Local directory to store image file
    :return: None
    """

    name = os.path.basename(url)
    local_image_path = os.path.join(local_dir, name)

    # Local image exists
    if os.path.exists(local_image_path):
        logger.debug("Image file exists, no need to download, %s" % local_image_path)
        return

    # Local image doesn't exist
    response = None
    retry_times = 3
    for i in range(retry_times):
        try:
            logger.debug("Downloading image retry times #%d: %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), stream=True, timeout=30)
            logger.debug("Status code: " + str(response.status_code))
            if response.status_code == requests.codes.ok:
                with open(local_image_path, 'wb') as f:
                    logger.debug("Getting image content...")
                    content = response.content
                    logger.debug("Writing image file: %s" % local_image_path)
                    f.write(content)
                logger.debug("Downloaded: " + url)
                break
            else:
                logger.error("Cannot download image: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while download images")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()


def get_html_content(url):
    """
    Get html result
    :param url: URL
    :return: result
    """
    result = ''
    response = None
    retry_times = 3
    for i in range(retry_times):
        try:
            logger.debug("Getting html content retry times #%d, %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), timeout=30)
            if response.status_code == requests.codes.ok:
                result = response.content
                break
            else:
                logger.error("Failed to get html from: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while getting html result")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()
    return result


def get_category_pages(url):
    """
    Get the number of pages of a category from its pagination block
    :param url: category URL
    :return: number of pages, or 0 if it cannot be determined
    """
    try:
        soup = BeautifulSoup(get_html_content(url), 'html.parser')
        page = soup.select('.page')
        pattern = re.compile(r'…(\d+)')
        result = re.findall(pattern, page[0].get_text())
        if len(result) > 0:
            return result[0]
        else:
            logger.warning("Cannot get category pages, %s" % url)
            return 0
    except Exception as e:
        logger.error("Failed to parse url: " + url)
        logger.error(str(e))
        return 0


def parse_category_page(category):
    """
    Parse category page
    :param category: category
    :return: None
    """

    # Check category directory
    logger.info("Parsing category: " + category.title + ", url: " + category.url)
    category_path = os.path.join(BASE_DIR, 'images', category.title)
    if not os.path.exists(category_path):
        logger.info("Making dir: " + category_path)
        os.makedirs(category_path)

    # Parse each page of current category
    for i in range(1, category.pages + 1):
        page_name = ''
        if i > 1:
            page_name = "index_" + str(i) + ".htm"

        # Find all image links in current page
        page_url = os.path.join(category.url, page_name)
        logger.info("Page url: " + page_url)
        try:
            soup = BeautifulSoup(get_html_content(page_url), 'html.parser')
            image_links = soup.select('.list a')
            for link in image_links:

                # Skip image links that are absolute http:// urls
                if 'http://' in link['href']:
                    continue

                # Title is always empty, not needed here
                image = ImageItem('', parse.urljoin(MAIN_URL, link['href']), category_path)
                image.process_images()
        except Exception as e:
            logger.error("Failed to parse url: " + page_url)
            logger.error(str(e))


def get_request_header():
    headers = {
        "user-agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    }
    return headers


def get_categories():
    """
    Get all categories.
    :return: all categories
    """
    categories = []

    # Get all categories from main page
    soup = BeautifulSoup(get_html_content(MAIN_URL), 'html.parser')
    categories_links = soup.select('.cate a')
    for ca_link in categories_links:

        # Skip categories whose url is an absolute http:// link
        if 'http://' in ca_link['href']:
            continue
        category_url = parse.urljoin(MAIN_URL, ca_link['href'])
        category_pages = get_category_pages(category_url)
        category = ImageCategory(ca_link.get_text(), category_url, int(category_pages))
        logger.debug("Category title: %s, url: %s, pages: %s" % (category.title, category.url, category.pages))
        categories.append(category)

    return categories


def init_logger():
    """
    Init logger
    :return: None
    """
    logger.setLevel(logging.DEBUG)

    logfile = FileHandler(os.path.join(BASE_DIR, 'main3.log'))
    console = StreamHandler()

    logfile.setLevel(logging.DEBUG)
    console.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    logfile.setFormatter(fmt=formatter)
    console.setFormatter(fmt=formatter)

    logger.addHandler(logfile)
    logger.addHandler(console)


def start_all():
    # Create base dir if it doesn't exist
    if not os.path.exists(BASE_DIR):
        os.mkdir(BASE_DIR)
    init_logger()
    logger.info("Starting main...")
    categories = get_categories()
    for category in categories:
        parse_category_page(category)
    logger.info("Finished")


if __name__ == '__main__':
    start_all()

 
