Python爬虫练习:采集头条文章图片,并存储到MongoDB数据库

mongo_config.py

# MongoDB connection settings: host, database name and collection name
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Range of list-page indexes to crawl (inclusive); the crawler multiplies
# each index by 20 to obtain the request offset
GROUP_START = 1
GROUP_END = 20

# Search keyword used for the list requests
KEYWORD = '美食'

app.py

import requests
import pymongo
import json
import re
import os
from hashlib import md5
from urllib.parse import urlencode
from requests.exceptions import RequestException
from mongo_config import *
from multiprocessing import Pool

# MongoDB initialisation: a module-level client and a handle to the target
# database, shared by every function in this file.
# NOTE(review): this client is created at import time, before the
# multiprocessing Pool is started; PyMongo documents MongoClient as not
# fork-safe, so on fork-based platforms each worker should presumably create
# its own client - confirm against the PyMongo FAQ.
client = pymongo.MongoClient(MONGO_URL)
mango_db = client[MONGO_DB]

# Request one page of the search-result list
def get_find_index(page=0, keyword="街拍", timeout=10):
    """Fetch one page of Toutiao search results.

    Args:
        page: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever, which would stall the
            calling worker process.

    Returns:
        The raw response body (bytes) when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        including a timeout).
    """
    data = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': page,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'en_qc': 1,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
        # Hard-coded value; it is not regenerated per request.
        'timestamp': '1556807950562'
    }
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'cookie': 'tt_webid=6686435681956447751; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16a78fa5ae74c8-012c5c6db69a84-6353160-1fa400-16a78fa5ae8380; CNZZDATA1259612802=1163063110-1556803453-%7C1556803453; __tasessionId=8pp8fiish1556807834386; tt_webid=6686435681956447751; csrftoken=fe46f2a51c61c8af81792a8cc3d368d9; s_v_web_id=3db23f6b1fb477d9e6f9b9a234e364b7',
    }

    try:
        q_data = requests.get(
            'https://www.toutiao.com/api/search/content/?' + urlencode(data),
            headers=headers,
            timeout=timeout,
        )
    except RequestException:
        # Timeout is a RequestException subclass, so it is reported here too.
        print('请求索引错误')
        return None
    if q_data.status_code == 200:
        return q_data.content
    return None

# Extract article URLs from the search-list response
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search-list response.

    Args:
        html: JSON text (``str`` or ``bytes``) of the list response, or
            ``None`` when the request failed.

    Yields:
        The ``article_url`` value of each entry under the top-level ``data``
        key; ``None`` is yielded for an entry that has no such key, so
        callers must skip falsy values.

    Nothing is yielded when the input is ``None``, is not valid JSON, or
    does not have the expected ``{"data": [...]}`` shape.
    """
    try:
        data_json = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: html is None; ValueError: the body is not valid JSON
        # (json.JSONDecodeError is a ValueError subclass).
        return
    if not isinstance(data_json, dict):
        return
    items = data_json.get('data')
    if not isinstance(items, list):
        # Covers a missing key as well as an explicit null value.
        return
    for item in items:
        if isinstance(item, dict):
            yield item.get('article_url')

# Request the HTML of an article page
def get_find_show(url, timeout=10):
    """Download an article page.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever, which would stall the
            calling worker process.

    Returns:
        The decoded response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or any ``RequestException``,
        including a timeout).
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'cookie': 'tt_webid=6686435681956447751; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16a78fa5ae74c8-012c5c6db69a84-6353160-1fa400-16a78fa5ae8380; CNZZDATA1259612802=1163063110-1556803453-%7C1556803453; __tasessionId=8pp8fiish1556807834386; tt_webid=6686435681956447751; csrftoken=fe46f2a51c61c8af81792a8cc3d368d9; s_v_web_id=3db23f6b1fb477d9e6f9b9a234e364b7',
    }
    try:
        q_data = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Timeout is a RequestException subclass, so it is reported here too.
        print('请求错误',url)
        return None
    if q_data.status_code == 200:
        return q_data.text
    return None

# Extract the title and the image URL list from an article page
def parse_page_show(html):
    """Parse an article page and download every image found in it.

    The title is taken from the ``<title>`` element and the image URLs from
    the HTML-entity-escaped ``pgc-img`` markup. Each image URL is passed to
    ``download_img`` as a side effect.

    Returns:
        ``{'title': ..., 'img_list': [...]}``, or ``None`` (after printing
        ``'NoneType'``) when an ``AttributeError`` occurs, e.g. because no
        title was found.
    """
    page_text = str(html)
    title_match = re.search('<title>(.*?)</title><meta', page_text, re.S)
    image_urls = re.findall(
        'class&#x3D;&quot;pgc-img&quot;&gt;&lt;img src&#x3D;&quot;(.*?)&quot; img_width',
        page_text,
        re.S,
    )
    try:
        # Raises AttributeError when the title pattern did not match.
        title_text = str(title_match.group(1))
        for image_url in image_urls:
            download_img(image_url)
    except AttributeError:
        print('NoneType')
        return None
    return {'title': title_text, 'img_list': image_urls}

# Store one parsed article in MongoDB
def save_to_mango(result):
    """Insert ``result`` into the configured MongoDB collection.

    ``insert_one`` signals failure by raising rather than by returning a
    falsy value, so the failure path has to catch the driver's exception;
    testing the return value alone can never reach ``return False``.

    Args:
        result: Document (dict) to insert.

    Returns:
        ``True`` when the document was inserted, ``False`` when the driver
        raised a ``PyMongoError``.
    """
    try:
        mango_db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        print('存储到MongoDB失败', exc)
        return False
    print('成功存储到MangoDB')
    return True

# Request an image URL and hand the bytes to save_img
def download_img(url, timeout=10):
    """Download one image and store it on disk via ``save_img``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever, which would stall the
            calling worker process.

    Returns:
        Always ``None``. The image is saved only when the server answers
        with HTTP 200; request errors (including timeouts) are printed and
        otherwise ignored.
    """
    print('下载',url)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'cookie': 'tt_webid=6686435681956447751; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16a78fa5ae74c8-012c5c6db69a84-6353160-1fa400-16a78fa5ae8380; CNZZDATA1259612802=1163063110-1556803453-%7C1556803453; __tasessionId=8pp8fiish1556807834386; tt_webid=6686435681956447751; csrftoken=fe46f2a51c61c8af81792a8cc3d368d9; s_v_web_id=3db23f6b1fb477d9e6f9b9a234e364b7',
    }
    try:
        data = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Timeout is a RequestException subclass, so it is reported here too.
        print('请求错误',url)
        return None
    if data.status_code == 200:
        save_img(data.content)
    return None

# Write downloaded image bytes to disk
def save_img(content):
    """Save image bytes as ``<cwd>/images/<md5 of content>.jpg``.

    The file name is the MD5 hex digest of the content, so saving the same
    bytes twice maps to the same file and the second write is skipped.

    Paths are built with ``os.path.join`` so the function works on any
    platform; hard-coded backslash separators only form a valid directory
    path on Windows. ``exist_ok=True`` keeps directory creation safe when
    several worker processes reach this point at the same time.

    Args:
        content: Raw image bytes.
    """
    image_dir = os.path.join(os.getcwd(), 'images')
    os.makedirs(image_dir, exist_ok=True)
    file_path = os.path.join(
        image_dir, '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    )
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

# Controller: crawl one list page and process each article on it
def main(offset):
    """Process every article listed at the given search offset.

    Fetches the list page for ``KEYWORD``, then for each article URL fetches
    the article HTML, parses it, and stores the parsed result. Missing URLs,
    failed page requests and unparsable pages are skipped.
    """
    index_body = get_find_index(offset, KEYWORD)
    for article_url in parse_page_index(index_body):
        if not article_url:
            continue
        article_html = get_find_show(article_url)
        if not article_html:
            continue
        parsed = parse_page_show(article_html)
        if parsed:
            save_to_mango(parsed)

# Entry point: crawl every configured list page using a process pool
if __name__=='__main__':
    # Each page index is turned into a request offset in steps of 20.
    groups = [x * 20 for x in range(GROUP_START,GROUP_END+1)]
    # The context manager releases the worker processes once map() has
    # returned; map() blocks until every offset has been processed.
    with Pool() as pool:
        pool.map(main,groups)

总结:众所周知Bilibili是一个学习的网站!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

rock__rabbit

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值