Python小爬虫
该脚本可以分析HTML并抓取网络图片,对学习网络爬虫有一定帮助。该脚本基于python3以及其第三方库requests和BeautifulSoup。主要原理是使用requests库下载网页内容,再使用BeautifulSoup分析网页内容,获取指定的图片标签及其链接,最后使用requests下载图片。
准备环境
基于Python3和Ubuntu,在Windows及其它平台同样支持,正确安装相关的第三方库即可。
Install pip
sudo apt-get install python3-pip
Install requests
This library is used to get html content.
pip3 install requests
Install BeautifulSoup
This library is used to parse html content, it's very powerful.
pip3 install beautifulsoup4
运行脚本
Run command:
python download_pic3.py
源码
download_pic3.py
import logging
import os
import re
from logging import FileHandler, StreamHandler
from urllib import parse
import requests
from bs4 import BeautifulSoup
# Define base directory to store downloaded images and log file.
# BUG FIX: the original passed the string literal '__file__' (quoted), which
# made abspath resolve against the current working directory instead of the
# script's own location. Use the real __file__ dunder.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Landing page of the wallpaper site being crawled.
MAIN_URL = "http://www.netbian.com/"
# Module-level logger; handlers are attached later by init_logger().
logger = logging.getLogger('download_images_logger')
class ImageCategory:
    """A wallpaper category scraped from the site's main page.

    Plain data holder: the category's display title, the URL of its first
    listing page, and how many paginated listing pages it contains.
    """

    def __init__(self, title, url, pages):
        # Human-readable name (the link text from the main page).
        self.title = title
        # Absolute URL of the category's landing page.
        self.url = url
        # Number of listing pages in this category.
        self.pages = pages
class ImageItem:
    """A single wallpaper detail page.

    Fetches the page at ``main_url``, collects the ``src`` of every image
    found under the ``.pic`` container, and downloads each one into
    ``local_dir``.
    """

    def __init__(self, title, main_url, local_dir):
        self.title = title
        self.main_url = main_url
        self.local_dir = local_dir
        self.__sub_urls = []

    def __parse_sub_urls(self):
        # Fetch the detail page and collect every image URL in one pass.
        logger.info("Parsing image urls...")
        soup = BeautifulSoup(get_html_content(self.main_url), 'html.parser')
        self.__sub_urls.extend(img['src'] for img in soup.select('.pic img'))

    def __download_all_images(self):
        # Download each collected URL into the category's local directory.
        logger.info("Downloading images...")
        for sub_url in self.__sub_urls:
            download_image(sub_url, self.local_dir)

    def process_images(self):
        """Parse the detail page and download its images; errors are logged,
        never raised, so one bad page does not stop the crawl."""
        logger.info(
            "Parse image item: " + self.title + ", main url: " + self.main_url + ", local dir: " + self.local_dir)
        try:
            self.__parse_sub_urls()
            self.__download_all_images()
        except Exception as e:
            logger.error("Failed to parse image, main url: " + self.main_url)
            logger.error(str(e))
def download_image(url, local_dir):
    """
    Download an image file with up to 3 retries.

    The image is first written to a temporary ".part" file and atomically
    renamed on success. The original wrote directly to the final path, so a
    failed or interrupted download left a truncated file behind that later
    runs would mistake for a finished download and skip.

    :param url: URL of the image
    :param local_dir: Local directory to store image file
    :return: None
    """
    name = os.path.basename(url)
    local_image_path = os.path.join(local_dir, name)
    # Local image exists
    if os.path.exists(local_image_path):
        logger.debug("Image file exists, no need to download, %s" % local_image_path)
        return
    # Local image doesn't exist
    tmp_path = local_image_path + '.part'
    retry_times = 3
    for i in range(retry_times):
        # Reset per attempt; the original hoisted this out of the loop and
        # could close a stale response from a previous attempt.
        response = None
        try:
            logger.debug("Downloading image retry times #%d: %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), stream=True, timeout=30)
            logger.debug("Status code: " + str(response.status_code))
            if response.status_code == requests.codes.ok:
                with open(tmp_path, 'wb') as f:
                    logger.debug("Getting image content...")
                    content = response.content
                    logger.debug("Writing image file: %s" % local_image_path)
                    f.write(content)
                # Publish the finished file atomically.
                os.replace(tmp_path, local_image_path)
                logger.debug("Downloaded: " + url)
                break
            else:
                logger.error("Cannot download image: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while download images")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()
def get_html_content(url):
    """
    Fetch a page's raw content with up to 3 retries.

    :param url: URL
    :return: page body as bytes, or b'' if every attempt failed
             (the original returned str '' on failure but bytes on success;
             b'' keeps the return type consistent, and BeautifulSoup accepts
             bytes either way)
    """
    result = b''
    retry_times = 3
    for i in range(retry_times):
        # Reset per attempt so the finally-block never closes a stale
        # response from an earlier iteration.
        response = None
        try:
            logger.debug("Getting html content retry times #%d, %s" % ((i + 1), url))
            response = requests.get(url, headers=get_request_header(), timeout=30)
            if response.status_code == requests.codes.ok:
                result = response.content
                break
            else:
                logger.error("Failed to get html from: " + url + ", return code: " + str(response.status_code))
        except Exception as e:
            logger.error("Exception occurred while getting html result")
            logger.error(str(e))
        finally:
            if response is not None:
                response.close()
    return result
def get_category_pages(url):
    """
    Extract the total page count from a category's pagination bar.

    The count is the number after the ellipsis in the ``.page`` element.

    :param url: category landing-page URL
    :return: page count as int; 0 when it cannot be determined.
             BUG FIX: the original's except-path fell off the end and
             returned None, which crashed the caller's int() conversion;
             it also returned a str on success but an int 0 otherwise.
    """
    try:
        soup = BeautifulSoup(get_html_content(url), 'html.parser')
        page = soup.select('.page')
        pattern = re.compile(r'…(\d+)')
        result = re.findall(pattern, page[0].get_text())
        if len(result) > 0:
            return int(result[0])
        logger.warning("Cannot get category pages, %s" % url)
        return 0
    except Exception as e:
        logger.error("Failed to parse url: " + url)
        logger.error(str(e))
        return 0
def parse_category_page(category):
    """
    Crawl every listing page of one category and download its images.

    :param category: ImageCategory (title, url, pages)
    :return: None
    """
    # Ensure the per-category image directory exists
    logger.info("Parsing category: " + category.title + ", url: " + category.url)
    category_path = os.path.join(BASE_DIR, 'images', category.title)
    if not os.path.exists(category_path):
        logger.info("Making dir: " + category_path)
        os.makedirs(category_path)
    # BUG FIX: build page URLs with urljoin instead of os.path.join — the
    # latter inserts a backslash on Windows and is not meant for URLs.
    # Normalize the trailing slash so urljoin appends rather than replaces
    # the last path segment.
    base_url = category.url if category.url.endswith('/') else category.url + '/'
    # Parse each page of current category
    for i in range(1, category.pages + 1):
        page_name = ''
        if i > 1:
            page_name = "index_" + str(i) + ".htm"
        # Find all image links in current page
        page_url = parse.urljoin(base_url, page_name)
        logger.info("Page url: " + page_url)
        try:
            soup = BeautifulSoup(get_html_content(page_url), 'html.parser')
            image_links = soup.select('.list a')
            for link in image_links:
                # Skip image with url "http://"
                if 'http://' in link['href']:
                    continue
                # Title is always empty here, so it is not passed through
                image = ImageItem('', parse.urljoin(MAIN_URL, link['href']), category_path)
                image.process_images()
        except Exception as e:
            logger.error("Failed to parse url: " + page_url)
            logger.error(str(e))
def get_request_header():
    """Return the HTTP headers sent with every request.

    A desktop-browser User-Agent is used so the site serves the normal
    page markup instead of blocking the script's default agent string.
    """
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
    return {"user-agent": user_agent}
def get_categories():
    """
    Get all categories from the site's main page.

    :return: list of ImageCategory objects
    """
    categories = []
    # Get all categories from main page
    soup = BeautifulSoup(get_html_content(MAIN_URL), 'html.parser')
    categories_links = soup.select('.cate a')
    for ca_link in categories_links:
        # Skip category which url is http://....
        if 'http://' in ca_link['href']:
            continue
        category_url = parse.urljoin(MAIN_URL, ca_link['href'])
        # Guard against a missing page count (get_category_pages can fail):
        # fall back to 0 rather than crashing on int(None).
        category_pages = get_category_pages(category_url) or 0
        category = ImageCategory(ca_link.get_text(), category_url, int(category_pages))
        logger.debug("Category title: %s, url: %s, pages: %s" % (category.title, category.url, category.pages))
        categories.append(category)
    return categories
def init_logger():
    """
    Configure the module logger: DEBUG and above to a log file in BASE_DIR,
    INFO and above to the console, both with a timestamped format.

    :return: None
    """
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

    file_handler = FileHandler(os.path.join(BASE_DIR, 'main3.log'))
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(fmt=formatter)
    logger.addHandler(file_handler)

    console_handler = StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(fmt=formatter)
    logger.addHandler(console_handler)
def start_all():
    """Entry point: prepare the base directory and logging, then crawl
    every category found on the main page."""
    # Create base dir if doesn't exist
    if not os.path.exists(BASE_DIR):
        os.mkdir(BASE_DIR)
    init_logger()
    logger.info("Starting main...")
    for category in get_categories():
        parse_category_page(category)
    logger.info("Finished")
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    start_all()