爬虫基础(10)

  • 目录
    1.函数式编程
    2.面向对象编程
  • 函数式
# -*- encoding: utf-8 -*-
'''
- F12,分析动态网页,获取图片地址
- json解析获取图片链接
- 下载图片并保存至本地
'''


import requests, json, os


def getHtml(url):
    """Fetch *url* and return the requests.Response, or None on failure.

    A desktop User-Agent is sent because douban rejects the default
    python-requests UA; the 10s timeout keeps the crawl from hanging.
    Request/HTTP errors are logged here rather than raised, so callers
    MUST handle a None return.
    """
    try:
        req = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
        req.raise_for_status()  # turn 4xx/5xx responses into an exception
        return req
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP errors are expected here;
        # anything else is a programming error and should propagate.
        print("getHtml产生异常:{}".format(e))
        return None  # explicit: the original fell through and returned None implicitly

def downloads(tups):
    """Download each (url, name) pair in *tups* and save it as ./<name>.jpg.

    A failed download (including getHtml returning None) is logged and
    skipped so the remaining images are still attempted.
    """
    for url, name in tups:
        try:
            resp = getHtml(url)
            if resp is None:
                # getHtml already logged the error; the original relied on an
                # AttributeError from None.content being caught below.
                continue
            with open("./{}.jpg".format(name), 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            print("图片无法下载:{}".format(e))
            continue

def main():
    """Crawl douban's photo-search JSON API and download up to 1000 images.

    Creates ./picturesDownload, chdirs into it for the duration of the
    crawl, and always restores the original working directory (the
    original left the process in the download dir if anything raised).
    """
    path = r'./picturesDownload'
    if not os.path.exists(path):
        os.makedirs(path)
    pwd = os.getcwd()
    os.chdir(path)
    try:
        # The API pages 20 results at a time via the ``start`` parameter;
        # the template is loop-invariant, so build it once.
        jsUrl = 'https://www.douban.com/j/search_photo?q=王祖贤&limit=20&start={}'
        for num in range(0, 1000, 20):
            resp = getHtml(jsUrl.format(num))
            if resp is None:
                continue  # request failed and was logged; try the next page
            lis = json.loads(resp.text).get('images')
            if not lis:
                # No 'images' key or an empty page: stop paging. The original
                # crashed with TypeError on map(..., None) here.
                break
            tups = map(lambda x: (x.get('src'), x.get('id')), lis)
            downloads(tups)
    finally:
        os.chdir(pwd)  # restore cwd even if the crawl raises

main()
  • 面向对象
# -*- coding: utf-8 -*-

import requests
import json
import os


class spider(object):
    """Douban photo-search crawler.

    Fetches paged JSON results for a fixed query, parses out image
    (src, id) pairs, and saves each image under ./20190622/<id>.jpg.
    """

    def __init__(self):
        # Desktop UA: douban rejects the default python-requests UA.
        self.user_agent = {'user-agent': 'Mozilla/5.0'}
        # JSON search endpoint; formatted with (query, start offset).
        self.url = "https://www.douban.com/j/search_photo?q={}&limit=20&start={}"
        self.name = "王祖贤"
        # Per-image save path template, formatted with the image id.
        self.dir = r'./20190622/{}.jpg'

    def getHtmlText(self, url):
        """Return the response body of *url* as text, or None on failure."""
        try:
            req = requests.get(url, headers=self.user_agent, timeout=10)
            req.raise_for_status()
            return req.text
        except Exception as e:
            print("getHtmlText产生异常:{}".format(e))
            return None  # explicit; the original returned None implicitly

    def parserHtmlForUrl(self, html):
        """Parse the API JSON in *html* into an iterator of (src, id) tuples.

        BUG FIX: json.loads() no longer takes an ``encoding`` argument —
        it was ignored since Python 3.1 and removed in Python 3.9, so the
        original raised TypeError on modern interpreters.
        """
        lis = json.loads(html).get('images')
        if lis is None:
            return iter(())  # unexpected payload: yield nothing instead of crashing
        return map(lambda x: (x.get('src'), x.get('id')), lis)

    def printImage(self, urls):
        """Download every (url, name) pair in *urls* into ./20190622/.

        The try/except is per item, so one failed image no longer aborts
        the rest of the batch (the original wrapped the whole loop).
        """
        path = r'./20190622'
        if not os.path.exists(path):
            os.makedirs(path)
        for url, name in urls:
            try:
                pic = requests.get(url, headers=self.user_agent, timeout=10)
                pic.raise_for_status()  # consistent with getHtmlText
                print(name)
                with open(self.dir.format(name), 'wb') as f:
                    f.write(pic.content)
            except Exception as e:
                print('图片无法下载:{}'.format(e))

if __name__ == '__main__':
    # Drive the crawler: fetch the first two result pages (offsets 0 and 20),
    # parse the image links out of each, and download them.
    crawler = spider()
    for offset in (0, 20):
        page = crawler.getHtmlText(crawler.url.format(crawler.name, offset))
        image_pairs = crawler.parserHtmlForUrl(page)
        crawler.printImage(image_pairs)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值