爬虫基础(10)

  • 目录
    1.函数式编程
    2.面向对象编程
  • 函数式
# -*- encoding: utf-8 -*-
'''
- F12,分析动态网页,获取图片地址
- json解析获取图片链接
- 下载图片并保存至本地
'''


import requests, json, os


def getHtml(url):
    """Fetch *url* and return the requests.Response, or None on failure.

    A desktop User-Agent is sent because douban rejects the default
    python-requests UA; the 10s timeout keeps the crawl from hanging.
    Request/HTTP errors are logged here rather than raised, so callers
    MUST handle a None return.
    """
    try:
        req = requests.get(url, headers={'user-agent': 'Mozilla/5.0'}, timeout=10)
        req.raise_for_status()  # turn 4xx/5xx responses into an exception
        return req
    except requests.RequestException as e:
        # Narrowed from bare Exception: only network/HTTP errors are expected here;
        # anything else is a programming error and should propagate.
        print("getHtml产生异常:{}".format(e))
        return None  # explicit: the original fell through and returned None implicitly

def downloads(tups):
    """Download each (url, name) pair in *tups* and save it as ./<name>.jpg.

    A failed download (including getHtml returning None) is logged and
    skipped so the remaining images are still attempted.
    """
    for url, name in tups:
        try:
            resp = getHtml(url)
            if resp is None:
                # getHtml already logged the error; the original relied on an
                # AttributeError from None.content being caught below.
                continue
            with open("./{}.jpg".format(name), 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            print("图片无法下载:{}".format(e))
            continue

def main():
    """Crawl douban's photo-search JSON API and download up to 1000 images.

    Creates ./picturesDownload, chdirs into it for the duration of the
    crawl, and always restores the original working directory (the
    original left the process in the download dir if anything raised).
    """
    path = r'./picturesDownload'
    if not os.path.exists(path):
        os.makedirs(path)
    pwd = os.getcwd()
    os.chdir(path)
    try:
        # The API pages 20 results at a time via the ``start`` parameter;
        # the template is loop-invariant, so build it once.
        jsUrl = 'https://www.douban.com/j/search_photo?q=王祖贤&limit=20&start={}'
        for num in range(0, 1000, 20):
            resp = getHtml(jsUrl.format(num))
            if resp is None:
                continue  # request failed and was logged; try the next page
            lis = json.loads(resp.text).get('images')
            if not lis:
                # No 'images' key or an empty page: stop paging. The original
                # crashed with TypeError on map(..., None) here.
                break
            tups = map(lambda x: (x.get('src'), x.get('id')), lis)
            downloads(tups)
    finally:
        os.chdir(pwd)  # restore cwd even if the crawl raises

main()
  • 面向对象
# -*- coding: utf-8 -*-

import requests
import json
import os


class spider(object):
    """Douban photo-search crawler.

    Fetches paged JSON results for a fixed query, parses out image
    (src, id) pairs, and saves each image under ./20190622/<id>.jpg.
    """

    def __init__(self):
        # Desktop UA: douban rejects the default python-requests UA.
        self.user_agent = {'user-agent': 'Mozilla/5.0'}
        # JSON search endpoint; formatted with (query, start offset).
        self.url = "https://www.douban.com/j/search_photo?q={}&limit=20&start={}"
        self.name = "王祖贤"
        # Per-image save path template, formatted with the image id.
        self.dir = r'./20190622/{}.jpg'

    def getHtmlText(self, url):
        """Return the response body of *url* as text, or None on failure."""
        try:
            req = requests.get(url, headers=self.user_agent, timeout=10)
            req.raise_for_status()
            return req.text
        except Exception as e:
            print("getHtmlText产生异常:{}".format(e))
            return None  # explicit; the original returned None implicitly

    def parserHtmlForUrl(self, html):
        """Parse the API JSON in *html* into an iterator of (src, id) tuples.

        BUG FIX: json.loads() no longer takes an ``encoding`` argument —
        it was ignored since Python 3.1 and removed in Python 3.9, so the
        original raised TypeError on modern interpreters.
        """
        lis = json.loads(html).get('images')
        if lis is None:
            return iter(())  # unexpected payload: yield nothing instead of crashing
        return map(lambda x: (x.get('src'), x.get('id')), lis)

    def printImage(self, urls):
        """Download every (url, name) pair in *urls* into ./20190622/.

        The try/except is per item, so one failed image no longer aborts
        the rest of the batch (the original wrapped the whole loop).
        """
        path = r'./20190622'
        if not os.path.exists(path):
            os.makedirs(path)
        for url, name in urls:
            try:
                pic = requests.get(url, headers=self.user_agent, timeout=10)
                pic.raise_for_status()  # consistent with getHtmlText
                print(name)
                with open(self.dir.format(name), 'wb') as f:
                    f.write(pic.content)
            except Exception as e:
                print('图片无法下载:{}'.format(e))

if __name__ == '__main__':
    # Drive the crawler: fetch the first two result pages (offsets 0 and 20),
    # parse the image links out of each, and download them.
    crawler = spider()
    for offset in (0, 20):
        page = crawler.getHtmlText(crawler.url.format(crawler.name, offset))
        image_pairs = crawler.parserHtmlForUrl(page)
        crawler.printImage(image_pairs)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值