全是干货:Python 爬虫实战——爬取今日头条街拍美女(JSON 数据解析、JS 解析、map 映射、类方法参数传递、pymongo 的使用)

最新推荐文章于 2021-07-05 16:31:10 发布

一心要爆肝的浩浩

最新推荐文章于 2021-07-05 16:31:10 发布

阅读量462

点赞数 1

文章标签：爬虫实战 json数据解析 js请求处理 map函数映射 NOSQL的使用

本文链接：https://blog.csdn.net/weixin_42336559/article/details/80781662

版权

import requests, re, json, pymongo, os
from urllib.parse import urlencode
#引入md5加密函数
from hashlib import md5
#引入多进程模块中的进程池
from multiprocessing import Pool

# os: module for working with local files and directories.
# json: module for parsing JSON data.
# With a NoSQL store the database and collection do not have to be created
# ahead of time; per the original author's note they are created automatically
# once they are configured here.

# Host passed to pymongo.MongoClient.
MONGO_HOST = 'localhost'
# Name of the database selected on the client.
# NOTE(review): "MOGO_DB" looks like a typo for "MONGO_DB"; JiePaiSpider reads
# the constant under this exact spelling, so rename both places together.
MOGO_DB = 'jiepai'
# NOTE(review): not referenced by any code visible in this file; documents are
# written to a collection named 'image' instead. Confirm before removing.
MONGO_TABLE = 'jiepai'

class JiePaiSpider(object):
    """Crawler for Toutiao gallery search results for the keyword "街拍".

    The spider requests one page of search results at a time, fetches each
    article page, extracts the gallery JSON embedded in the page's JavaScript,
    downloads every gallery image into the current working directory and
    stores the gallery document in MongoDB.
    """

    # Seconds to wait on any single HTTP request. Without a timeout a stalled
    # connection would block the calling worker indefinitely.
    REQUEST_TIMEOUT = 10

    # Captures the argument of ``gallery: JSON.parse(<string literal>),``.
    # The dot and the parentheses are escaped so that only the literal between
    # the parentheses is captured.
    _GALLERY_RE = re.compile(r'gallery: JSON\.parse\((.*)\),')

    def __init__(self):
        """Prepare the request headers; the MongoDB client is opened lazily."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'
        }
        # The client is created on first use (see ``client``), not here: a
        # MongoClient must not be copied from a parent process into forked
        # children, and instances of this class are created before the worker
        # pool starts.
        self._client = None
        self._client_pid = None

    @property
    def client(self):
        """Return a MongoClient that belongs to the current process.

        A new client is created on first access and again whenever the
        process id changes (i.e. after a fork).
        """
        pid = os.getpid()
        if getattr(self, '_client', None) is None or getattr(self, '_client_pid', None) != pid:
            self._client = pymongo.MongoClient(MONGO_HOST)
            self._client_pid = pid
        return self._client

    @property
    def db(self):
        """Return the database handle selected by ``MOGO_DB``."""
        return self.client[MOGO_DB]

    def _fetch(self, url, error_label, binary=False):
        """GET ``url`` and return its body, or None on any failure.

        :param url: address to request
        :param error_label: text printed in front of the error when the
            request raises
        :param binary: return ``response.content`` (bytes) instead of
            ``response.text``
        :return: the body, or None if the request raised or the status code
            was not 200
        """
        try:
            response = requests.get(url, headers=self.headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(error_label, e, url)
            return None
        if response.status_code != 200:
            return None
        return response.content if binary else response.text

    def get_page_list(self, offset):
        """
        Fetch one page of search results.

        :param offset: result offset; first page: 0, second page: 20
        :return: the response body (JSON text), or None on failure
        """
        params = {
            "autoload": "true",
            "count": 20,
            "cur_tab": 3,
            "format": "json",
            "from": "gallery",
            "keyword": "街拍",
            "offset": offset
        }

        # e.g. https://www.toutiao.com/search_content/?offset=20&format=json&keyword=...&autoload=true&count=20&cur_tab=3&from=gallery
        url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
        return self._fetch(url, '请求列表页异常')

    def parse_page_list(self, json_data):
        """
        Yield the article URL of every search result in ``json_data``.

        Implemented as a generator so the URLs are produced one at a time.
        Results without an ``article_url`` are skipped, and a body that is
        not a JSON object with a list under ``data`` yields nothing.

        :param json_data: JSON text as returned by ``get_page_list``
        :return: generator of article URL strings
        """
        try:
            payload = json.loads(json_data)
        except (TypeError, ValueError) as e:
            print('解析列表页JSON失败', e)
            return
        if not isinstance(payload, dict):
            return
        # ``data`` may be absent or null; treat both as "no results".
        for item in payload.get('data') or []:
            if not isinstance(item, dict):
                continue
            article_url = item.get('article_url')
            if article_url:
                yield article_url

    def get_page_detail(self, detail_url):
        """
        Fetch an article (detail) page.

        :param detail_url: address of the detail page
        :return: the HTML source, or None on failure
        """
        return self._fetch(detail_url, '请求详情页异常:')

    @classmethod
    def _extract_gallery(cls, html):
        """Return the gallery dict embedded in ``html``, or None.

        The page carries ``gallery: JSON.parse("<escaped JSON>"),``. The
        argument is a string literal whose content is JSON, so it is decoded
        twice: once to unescape the literal, once to parse the JSON. Simply
        deleting backslashes would corrupt values that contain escaped quotes
        or backslashes.
        """
        match = cls._GALLERY_RE.search(html or '')
        if match is None:
            return None
        literal = match.group(1).strip()
        try:
            decoded = json.loads(literal)
            if isinstance(decoded, str):
                decoded = json.loads(decoded)
        except ValueError:
            # Fall back to the previous lenient decoding for literals that are
            # not valid JSON strings themselves.
            try:
                decoded = json.loads(literal.replace('\\', '').strip('"'))
            except ValueError:
                return None
        return decoded if isinstance(decoded, dict) else None

    def parse_page_detail(self, html):
        """
        Process a detail page: download its images and store its gallery data.

        :param html: HTML source of the detail page
        :return: None
        """
        gallery = self._extract_gallery(html)
        if gallery is None:
            print('没有找到js中的json字符串')
            return

        # Download every image that actually carries a URL.
        for image in gallery.get('sub_images') or []:
            image_url = image.get('url') if isinstance(image, dict) else None
            if image_url:
                self.save_image(image_url)

        # Persist the gallery document in MongoDB.
        self.save_to_mongodb(gallery)

    def download_image(self, url):
        """
        Download one image.

        :param url: image address
        :return: the raw bytes, or None on failure
        """
        return self._fetch(url, '请求图片失败:', binary=True)

    def save_image(self, url):
        """
        Download ``url`` and write it to ``<cwd>/<md5 of url>.jpg``.

        :param url: image address
        :return: None
        """
        content = self.download_image(url)
        if not content:
            print('图片内容为空')
            return
        # os.getcwd() is the current working directory of the process; the
        # MD5 hex digest of the URL is used as the file name.
        file_name = md5(url.encode('utf-8')).hexdigest() + '.jpg'
        with open(os.path.join(os.getcwd(), file_name), 'wb') as f:
            f.write(content)

    def save_to_mongodb(self, data):
        """Insert ``data`` as one document into the ``image`` collection."""
        self.db['image'].insert_one(data)


# Module-level spider instance used by get_all_data(); it is created when the
# module is imported, i.e. before the worker pool below is started.
jp = JiePaiSpider()


def get_all_data(offset):
    """Crawl one page of search results and process every article on it.

    Fetches the result list at ``offset``, then for each article URL fetches
    the detail page and hands it to the spider for image download and
    storage. Failures at any step are skipped rather than raised.

    :param offset: result offset of the page to crawl (0, 20, 40, ...)
    :return: None
    """
    print('正在请求第{}页'.format(offset))
    json_str = jp.get_page_list(offset)
    if not json_str:
        return

    for url in jp.parse_page_list(json_str):
        if not url:
            # Defensive: skip entries that carry no URL rather than issuing a
            # request that cannot succeed.
            continue
        html = jp.get_page_detail(url)
        if html:
            jp.parse_page_detail(html)


if __name__ == '__main__':
    # Crawl six result pages (offsets 0, 20, ..., 100), one task per page.
    # pool.map(function, iterable) applies the function to every item and
    # blocks until all tasks are done; the ``with`` block then shuts the
    # worker processes down instead of leaving the pool open.
    with Pool() as pool:
        pool.map(get_all_data, range(0, 120, 20))