Python: crawling the Douguo Meishi (豆果美食) mobile app and saving the data to MongoDB


Crawler code

import requests
import time
import json
import queue
from handle_mongo import mongo_info
from handle_proxy import proxy
import random
# Thread pool for concurrent crawling
from concurrent.futures import ThreadPoolExecutor
# Queue holding one search task per ingredient keyword
qqq = queue.Queue()
# Fetch a list of proxy dicts ({'ip': ..., 'port': ...}) once at startup
ip = proxy()

def handle_requests(url, data):
    headers = {
        "client": "4",
        "version": "6932.2",
        "device": "SM-G955N",
        "sdk": "19,4.4.2",
        "imei": "354730010722829",
        "channel": "qqkp",
        "mac": "72:1C:E7:AA:15:BD",
        "resolution": "1280*720",
        "dpi": "1.5",
        "android-id": "721ce7aa15bd3673",
        "pseudo-id": "7aa15bd3673721ce",
        "brand": "samsung",
        "scale": "1.5",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "CMCC",
        # "imsi": "460077228231170",
        "user-agent": "Mozilla/5.0 (Linux; Android 4.4.2; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36",
        "reach": "1",
        "newbie": "0",
        # "lon": "104.569743",
        # "lat": "39.002595",
        # "cid": "152900",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58851623",
        "Host": "api.douguo.net",
        # "Content-Length": "68"
    }
    # Pick a random proxy for this request
    p = random.choice(ip)
    global dl
    dl = {
        'http': 'http://{}:{}'.format(p['ip'], p['port'])
    }
    html = requests.post(url=url, data=data, headers=headers, proxies=dl)
    return html.text

def handle_index():
    # Douguo category listing endpoint
    url = 'http://api.douguo.net/recipe/flatcatalogs'
    data = {
        "client": "4",
        "_session": "1550551890907354730010722829",
        "v": str(int(time.time())),
        "_vs": "2305",
    }
    response = handle_requests(url, data)
    index_response = json.loads(response)
    # Walk the three-level category tree; each leaf name is an ingredient keyword
    for i in index_response['result']['cs']:
        for x in i['cs']:
            for a in x['cs']:
                data2 = {
                    "client": "4",
                    # "_session": "1550551890907354730010722829",
                    "keyword": a['name'],
                    "order": "0",
                    "_vs": "400",
                }
                qqq.put(data2)


def handle_caipu_list(data):
    print('Now processing ingredient:', data['keyword'])
    caipu_url = 'http://api.douguo.net/recipe/v2/search/0/20'
    caipu_response = handle_requests(url=caipu_url, data=data)
    json_caipu = json.loads(caipu_response)
    for item in json_caipu['result']['list']:
        # Dict holding the information of one recipe
        caipu_info = {}
        caipu_info['shicai'] = data['keyword']
        if item['type'] == 13:
            caipu_info['user_name'] = item['r']['an']  # author name
            caipu_info['id'] = item['r']['id']  # recipe id
            caipu_info['describe'] = item['r']['cookstory'].replace('\n', '').replace(' ', '')  # description
            caipu_info['caipu_name'] = item['r']['n']  # recipe name
            caipu_info['cailiao_list'] = item['r']['major']  # ingredient list
            # print(caipu_info)
            # Build the detail URL for this recipe
            detail_url = 'http://api.douguo.net/recipe/detail/{}'.format(caipu_info['id'])
            detail_data = {
                "client": '4',
                # "_session": '1550551890907354730010722829',
                "author_id": '0',
                "_vs": '2801',
                # Build the _ext JSON payload instead of concatenating quoted strings by hand
                "_ext": json.dumps({"query": {"id": str(caipu_info['id']),
                                              "kw": caipu_info['shicai'],
                                              "idx": "3", "src": "2801", "type": "13"}},
                                   ensure_ascii=False, separators=(',', ':')),
            }
            # print(detail_url)
            # print(detail_data)
            info = handle_requests(url=detail_url, data=detail_data)
            # Parse the JSON response into a dict
            info_dict = json.loads(info)
            # Prefer the tag list from the detail response when it is available
            if info_dict['result']['recipe'].get('tags'):
                caipu_info['cailiao_list'] = info_dict['result']['recipe']['tags']
            caipu_info['cookstep'] = info_dict['result']['recipe']['cookstep']  # cooking steps

            print('Saving recipe:', caipu_info['caipu_name'])
            print('Proxy used:', dl)
            mongo_info.insert_item(caipu_info)

        else:
            continue

handle_index()

# Create a thread pool with 30 workers
pool = ThreadPoolExecutor(30)
# handle_index() has already filled the queue, so drain it and submit one task per keyword
while qqq.qsize() > 0:
    pool.submit(handle_caipu_list, qqq.get())
# Wait for all submitted tasks to finish
pool.shutdown(wait=True)

Database code

import pymongo


class Connect_mongo(object):
    def __init__(self):
        # Connect to the local MongoDB instance and select the database
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db_data = self.client['dou_guo_mei_shi']

    def insert_item(self, item):
        # Insert one recipe document; insert_one replaces the deprecated insert()
        self.db_data['dou_guo_mei_shi_item'].insert_one(item)

mongo_info = Connect_mongo()

Proxy code

import requests
import json

def proxy():
    # Zhima proxy API: returns a batch of proxy IPs as JSON
    url = 'http://webapi.http.zhimacangku.com/getip?num=5&type=2&pro=&city=0&yys=0&port=1&pack=34365&ts=1&ys=1&cs=1&lb=1&sb=0&pb=45&mr=2&regions='
    h = requests.get(url).text
    html = json.loads(h)
    if html:
        # 'data' is a list of dicts, each carrying at least an 'ip' and a 'port' field
        return html['data']
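
For context, the crawler above only relies on each proxy entry exposing 'ip' and 'port' fields. The sample entry below is made up purely to show how the returned list is consumed; it is not real output of the Zhima API.

import random

# Made-up sample entry; only the 'ip' and 'port' keys are actually used by the crawler
ip_list = [{'ip': '1.2.3.4', 'port': 4216}]

p = random.choice(ip_list)
proxies = {'http': 'http://{}:{}'.format(p['ip'], p['port'])}
# proxies is then passed to requests.post(..., proxies=proxies) inside handle_requests()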

 

A web crawler is an automated program for collecting information from the Internet. Its main job is to visit web pages, extract data and store it for later analysis or presentation. Crawlers are commonly used by search engines, data-mining tools, monitoring systems and other applications that harvest web data. A crawler's workflow consists of the following key steps; a minimal code sketch follows the list.

URL collection: the crawler starts from one or more seed URLs and discovers new URLs recursively or iteratively, building a URL queue. New URLs can come from link analysis, sitemaps, search engines and so on.

Requesting pages: the crawler sends HTTP (or other protocol) requests to the target URLs and fetches the HTML content, usually through an HTTP client library such as Requests in Python.

Parsing content: the crawler parses the fetched HTML and extracts the useful information. Common parsing tools include regular expressions, XPath and Beautiful Soup; they help locate and extract target data such as text, images and links.

Storing data: the extracted data is saved to a database, file or other storage medium for later analysis or presentation. Common choices include relational databases, NoSQL databases and JSON files.

Following the rules: to avoid overloading a site or triggering its anti-crawling defences, a crawler should respect the site's robots.txt, limit its request rate and crawl depth, and behave like a normal visitor, for example by setting a User-Agent.

Handling anti-crawling measures: because crawlers exist, many sites deploy countermeasures such as CAPTCHAs and IP bans, and crawler engineers have to design strategies to cope with them.

Crawlers are widely used for search-engine indexing, data mining, price monitoring, news aggregation and more. Crawling must nevertheless comply with legal and ethical norms, respect each site's usage policy, and avoid putting undue load on the servers being visited.
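
As an illustration of the steps above, here is a minimal, self-contained sketch of the fetch, parse and store loop with robots.txt handling. The start URL, the link-extraction regex and the output file name are placeholders invented for this illustration; a real project would typically use a proper HTML parser such as Beautiful Soup instead of a regex.

import json
import re
import time
import urllib.robotparser

import requests

START_URL = 'http://example.com/'          # hypothetical seed URL
USER_AGENT = 'Mozilla/5.0 (demo crawler)'  # identify the crawler politely

# Respect robots.txt before crawling
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.com/robots.txt')
rp.read()

def fetch(url):
    # Request the page with a normal-looking User-Agent
    return requests.get(url, headers={'User-Agent': USER_AGENT}, timeout=10).text

def parse(html):
    # Extract outgoing links with a simple regular expression
    return re.findall(r'href="(http[^"]+)"', html)

def store(item):
    # Append each extracted record to a JSON-lines file
    with open('items.jsonl', 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

if rp.can_fetch(USER_AGENT, START_URL):
    html = fetch(START_URL)
    for link in parse(html):
        store({'source': START_URL, 'link': link})
        time.sleep(1)  # rate limiting to avoid overloading the site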