爬取图集并保存到 MongoDB

最新推荐文章于 2024-07-21 02:59:10 发布

wokwn

最新推荐文章于 2024-07-21 02:59:10 发布

阅读量623

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/wokwn/article/details/79899863

版权

python 专栏收录该内容

17 篇文章 0 订阅

订阅专栏

import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException

import json
from bs4 import BeautifulSoup
import re



from config import *#引入config.py中所有变量
import pymongo
# Module-level MongoDB handles, built from the settings star-imported from config.
client=pymongo.MongoClient(MONGO_URL)
# Database named by MONGO_DB; collections are looked up from it by name.
db=client[MONGO_DB]




def get_page_index(offset, keyword):
    """Fetch one page of gallery search results as raw response text.

    Builds the search_content query string for *keyword* starting at
    *offset* and issues a GET request for it.

    Returns the response body when the status code is 200, otherwise
    None. A RequestException is reported by printing 'error' and also
    results in None.
    """
    # urlencode turns the dict into URL query parameters.
    query = urlencode({
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery',
    })
    url = 'https://www.toutiao.com/search_content/?' + query

    try:
        reply = requests.get(url)
        body = reply.text if reply.status_code == 200 else None
    except RequestException:
        print('error')
        body = None
    return body

def parse_page_index(html):
    """Yield the article URL of every entry in a search-index response.

    Args:
        html: JSON text of the index response, or None when the request
            for it failed.

    Yields:
        Each non-empty 'article_url' value found in the payload's 'data'
        list. Nothing is yielded when *html* is empty, is not valid JSON,
        or carries no usable 'data' list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError; report it the same way the
        # fetch helpers report their failures.
        print('error')
        return
    if not isinstance(payload, dict):
        return
    # 'data' may be missing or null in the response.
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        # Entries without an article URL cannot be followed, so skip them
        # instead of handing None to the caller.
        if article_url:
            yield article_url

def get_page_detail(url):
    """Fetch an article page and return its markup.

    Sends a GET request for *url* with a desktop-browser user-agent.

    Returns the response text when the status code is 200, otherwise
    None. A RequestException is reported by printing 'error' and also
    results in None.
    """
    # Original author's note: without this header the page later shows
    # only JSON.parse("{}").
    browser_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    }
    try:
        reply = requests.get(url, headers=browser_headers)
        page = reply.text if reply.status_code == 200 else None
    except RequestException:
        print('error')
        page = None
    return page
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    The page embeds its gallery as ``gallery: JSON.parse("{\\"count\\":7,
    \\"sub_images\\":[{\\"url\\":...``. The captured argument is a JSON
    string literal whose content is itself JSON text, so it is decoded
    twice: once to unescape the literal, once to obtain the dict.

    Every image URL found is passed to download_image as a side effect.

    Args:
        html: Article page markup.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with 'title', 'url' and 'images' keys, or None when the
        page has no parseable gallery containing 'sub_images'.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> still yields a record, with an empty title.
    title = title_tags[0].get_text() if title_tags else ''

    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    images_pattern = re.compile(r'JSON.parse.*?\((.*?)\)', re.S)
    result = images_pattern.search(html)  # group(0) would be the whole match
    if not result:
        return None

    try:
        data = json.loads(result.group(1))
        if isinstance(data, str):
            data = json.loads(data)
    except ValueError:
        # The non-greedy capture stops at the first ')', so a payload that
        # contains one is truncated; treat that like a page with no gallery.
        return None

    if not isinstance(data, dict) or 'sub_images' not in data:
        return None

    images = [
        item['url']
        for item in data['sub_images'] or []
        if isinstance(item, dict) and 'url' in item
    ]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }

# Store a record in MongoDB
def sava_to_mongo(result):
    """Insert one scraped record into the MONGO_TABLE collection.

    Args:
        result: Dict to store, or None when there is nothing to store.

    Returns:
        True when a document was inserted, False otherwise.
    """
    if result is None:
        return False
    # Collection.insert() is deprecated since PyMongo 3.0 and removed in
    # 4.0; insert_one() is its replacement for a single document.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is None:
        return False
    print('存储到mongodb成功', result)
    return True

# Download an image
def download_image(url):
    """Request *url* and hand the response bytes to save_image.

    Prints a progress line before the request. The bytes are saved only
    when the status code is 200. A RequestException is reported by
    printing a message. Always returns None.
    """
    print('正在下载', url)
    try:
        reply = requests.get(url)
        if reply.status_code != 200:
            return None
        # .content is the binary body; .text would be the decoded text.
        save_image(reply.content)
    except RequestException:
        print('请求图片出错')
    return None
import os
from hashlib import md5
def save_image(content):
    """Write image bytes to the current directory under a content-derived name.

    The file is named '<md5 hex digest of content>.jpg', so identical
    content always maps to the same path and is written only once.

    Args:
        content: Raw image bytes.
    """
    # os.getcwd() is the current working directory.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    try:
        # 'x' creates the file and fails if it already exists, in a single
        # step, so two worker processes cannot both pass a separate
        # exists() check and write the same file. The with-block closes
        # the file; no explicit close() is needed.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        # Already saved earlier (same digest): nothing to do.
        pass

def main(offset):
    """Scrape one page of '街拍' search results and store each gallery found.

    Fetches the index page at *offset*, follows every article URL in it,
    parses the gallery out of each article and saves the parsed record.
    Pages that fail to download or contain no gallery are skipped.

    Args:
        offset: Result offset passed to get_page_index.
    """
    index_html = get_page_index(offset, '街拍')
    if not index_html:
        # The index request failed; there is nothing to iterate over.
        return
    for url in parse_page_index(index_html):
        if not url:
            continue
        # The index lists http://toutiao.com/group/<id>/ while the article
        # itself is served under .../a<id>, so 'group/' is rewritten to 'a'.
        url = re.sub('group.*?/', 'a', url)
        detail_html = get_page_detail(url)
        if not detail_html:
            continue
        result = parse_page_detail(detail_html, url)
        # parse_page_detail returns None for pages without a gallery.
        if result:
            sava_to_mongo(result)


from multiprocessing import Pool
if __name__=='__main__':
    # One task per index page: page numbers GROUP_START..GROUP_END
    # (inclusive) scaled by 20, the 'count' requested per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # The context manager shuts the worker pool down once map() has
    # returned, instead of leaving it open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)

config.py

# MongoDB settings: the client address, the database name, and the
# collection the scraped records are inserted into.
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Inclusive range of page numbers; the scraper multiplies each by 20 to
# form a search offset.
GROUP_START = 1
GROUP_END = 20

wokwn

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录