Scraping Food Calories and Nutrients with Python (Full Code)

Target page

云健康网 (hpcn21.com) calorie lookup

Import the required libraries and modules
import re
import time

import bs4
import numpy as np
import pandas as pd
import requests
from urllib.request import urlretrieve  # used by the (optional) image download below
Food calorie & nutrient scraper
firstpage_url = "http://www.hpcn21.com/food"

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
          'Cookie': 'Hm_lvt_397c6669caf198f993b0bb244a74b53a=1657502938; Hm_lpvt_397c6669caf198f993b0bb244a74b53a=1657503671'}
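
A side note on `headers`: instead of repeating them on every `requests.get` call, they could be attached once to a `requests.Session`, which also reuses TCP connections across the many requests below. A minimal sketch; the rest of the article keeps the original per-call style:

session = requests.Session()
session.headers.update(headers)  # every session.get() now sends the UA and cookie automatically
# firstpage = session.get(firstpage_url)  # equivalent to requests.get(firstpage_url, headers=headers)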

# Get the food category links from the home page and build each category's URL
firstpage = requests.get(firstpage_url, headers=headers)
bs_firstpage = bs4.BeautifulSoup(firstpage.text, 'html.parser')
cate_list_url = []
for div in bs_firstpage.find_all('div', class_='cate'):  # locate the category blocks
    cate = div.find('a')['href'].split('/')[-1]  # extract the category slug
    cate_list_url.append(firstpage_url + '/' + cate)  # build the category URL

# For each category, find how many list pages it has and build every page's URL
# (a reusable version of this lookup is sketched after this block)

food_page_url = []
for url in cate_list_url:
    food_cate = requests.get(url, headers=headers)  # open the first page of the category
    bs_food_cate = bs4.BeautifulSoup(food_cate.text, 'html.parser')
    page_max_text = bs_food_cate.find('div', class_='pagelink').find_all('a')[-1]  # link to the last page
    page_max = int(re.findall(r"\d+", str(page_max_text))[0])  # highest page number
    sleep_time = np.random.randint(low=1, high=5)
    time.sleep(sleep_time)  # wait a random 1-4 seconds between requests
    food_page_url = food_page_url + [url + '/p' + str(x) + '.html' for x in range(1, page_max + 1)]  # all page URLs
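
The same "find the last pagination link" logic reappears in the recipe section below, where some categories have no pagination links at all. A hedged sketch of a reusable helper; the function name and the `return 1` fallback are my own:

import re
import bs4
import requests

def get_page_count(url, headers):
    """Number of list pages under a category URL; 1 if no pagination links are found."""
    soup = bs4.BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    pagelink = soup.find('div', class_='pagelink')
    links = pagelink.find_all('a') if pagelink else []
    if not links:
        return 1  # single-page category: there is no "last page" link to read
    return int(re.findall(r"\d+", str(links[-1]))[0])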
Scrape food calories and download food images
all_food_dict = {}  # all scraped food records, keyed by food id
for url in food_page_url:  # iterate over every list page
#     print('scraping: ' + url)
    foodlist = requests.get(url, headers=headers)  # open the list page
    bs_foodlist = bs4.BeautifulSoup(foodlist.text, 'html.parser')
    food_cate = bs_foodlist.find('h1').text.split(':')[0]  # top-level food category
    for food in bs_foodlist.find_all('div', class_="c fw_box_tt"):  # each list page holds 10 food entries
        foodinfo_dict = {}  # record for a single food
        foodinfo_dict['foodcate'] = food_cate
        food_id = food.find_all('p')[0].find('a')['href'].split('/')[-1]  # food id
        foodinfo_dict['id'] = food_id
        foodinfo_dict['name'] = food.find_all('p')[0].find('a')['title']
        img_url = food.find_all('p')[0].find('img')['src']
        foodinfo_dict['img'] = img_url
#         path = 'D:\\Python\\Jupyter\\6 爬虫\\01食物热量爬虫\\图片库\\' + img_url.split('/')[-1]
#         urlretrieve(img_url, path)  # download the image to the given path
        foodinfo_dict['alias'] = food.find_all('p')[1].text  # food alias
        foodinfo_dict['heat_cal'] = food.find_all('p')[2].text  # calories
        all_food_dict[food_id] = foodinfo_dict
#     print('scraped {} records so far'.format(len(all_food_dict.keys())))
#     sleep_time = np.random.randint(low=1, high=5)
#     time.sleep(sleep_time)  # wait a random 1-4 seconds
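
The image download is left commented out in the loop above; if switched on, it is safer to create the target folder first and to tolerate individual failures. A minimal sketch, with `food_images` as a stand-in for the author's local path:

import os
from urllib.request import urlretrieve

img_dir = 'food_images'  # hypothetical local folder
os.makedirs(img_dir, exist_ok=True)  # create it once; no error if it already exists
try:
    urlretrieve(img_url, os.path.join(img_dir, img_url.split('/')[-1]))
except OSError as e:
    print('image download failed:', img_url, e)  # skip broken links instead of aborting the crawl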
Shape the data into a DataFrame
# Data cleaning
df = pd.DataFrame(all_food_dict.values())
df['alias'] = df['alias'].apply(lambda x: x.split(':')[1])  # keep only the text after the full-width colon
df['heat_cal'] = df['heat_cal'].apply(lambda x: re.search(r"\d+(\.\d+)?", x).group()).astype('float')  # extract the numeric calorie value
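
At this point the calorie table can already be saved off, e.g. to Excel via pandas (the file name is arbitrary; `.xlsx` output needs the openpyxl package installed):

df.to_excel('food_heat.xlsx', index=False)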
Food nutrient detail scraper
# Detailed nutrient scraper
food_detail_dict = {}
for food_id in df['id'].values:
    food_url = 'http://www.hpcn21.com/shiwu/' + food_id
    food_detail = requests.get(food_url, headers=headers)  # open the food's detail page
    bs_food_detail = bs4.BeautifulSoup(food_detail.text, 'html.parser')
    detail_name = bs_food_detail.find_all('span', class_='dt')  # nutrient names
    detail_num = bs_food_detail.find_all('span', class_='dd')  # nutrient values
    food_dt_dict = {}
    for name, num in zip(detail_name, detail_num[1:]):  # skip the first 'dd' span so names and values line up on this page layout
        food_dt_dict[name.text] = num.text
    food_detail_dict[food_id] = food_dt_dict
    if (len(food_detail_dict.keys()) % 100) == 0:  # print progress every 100 records
        print('scraped {} records so far'.format(len(food_detail_dict.keys())))
    sleep_time = np.random.randint(low=1, high=5)
    time.sleep(sleep_time)  # wait a random 1-4 seconds
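
This serial loop is the slow part of the crawl (one request plus a sleep per food). If speed matters more than politeness to the server, it could be parallelized with gevent; the following is a sketch, not the author's code, and the worker count of 5 is my own assumption:

from gevent import monkey
monkey.patch_all()  # in a standalone script this must run before requests is imported
import gevent
import bs4
import requests
from gevent.queue import Empty, Queue

id_queue = Queue()
for food_id in df['id'].values:
    id_queue.put_nowait(food_id)

food_detail_dict = {}

def worker():
    while True:
        try:
            food_id = id_queue.get_nowait()
        except Empty:
            return  # queue drained, this worker is done
        page = requests.get('http://www.hpcn21.com/shiwu/' + food_id, headers=headers)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        names = soup.find_all('span', class_='dt')
        values = soup.find_all('span', class_='dd')
        food_detail_dict[food_id] = {n.text: v.text for n, v in zip(names, values[1:])}

gevent.joinall([gevent.spawn(worker) for _ in range(5)])  # 5 concurrent workers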
# Merge the two tables
food_detail_df = pd.DataFrame(food_detail_dict.values())
food_detail_df['id'] = list(food_detail_dict.keys())  # keys()/values() share insertion order, so ids line up with rows
final_food_detail = df.merge(food_detail_df, how='left', on='id')  # left join on food id
Recipe data scraper
## Scrape the recipe lists
caipu_url = 'http://www.hpcn21.com/caipu/2'
res = requests.get(caipu_url, headers=headers)
bs_res = bs4.BeautifulSoup(res.text, 'html.parser')

caipu_url_list = []
for href in bs_res.find('div', class_='caipu').find_all('a'):  # cuisine categories
    caipu_url_list.append('http://www.hpcn21.com' + href['href'])

caipu_page_url = []
for url in caipu_url_list:
    caipu = requests.get(url, headers=headers)  # open the cuisine's first page
    bs_caipu = bs4.BeautifulSoup(caipu.text, 'html.parser')
    last_page = bs_caipu.find('div', class_='pagelink').find_all('a')  # all pagination links
    if len(last_page) > 0:  # some cuisines have a single page and no "last page" link
        page_max_text = bs_caipu.find('div', class_='pagelink').find_all('a')[-1]  # link to the last page
        page_max = int(re.findall(r"\d+", str(page_max_text['href'].split('/')[3]))[0])  # highest page number
        caipu_page_url = caipu_page_url + [url + '/p' + str(x) + '.html' for x in range(1, page_max + 1)]  # all page URLs
    else:
        caipu_page_url = caipu_page_url + [url]
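
With the `get_page_count` helper sketched earlier, the block above could be written more compactly. My own restructuring; note the helper's simpler regex would need the recipe-specific href handling from above to match exactly:

caipu_page_url = []
for url in caipu_url_list:
    n = get_page_count(url, headers)
    if n == 1:
        caipu_page_url.append(url)  # single-page cuisine: keep the bare URL
    else:
        caipu_page_url += [url + '/p' + str(x) + '.html' for x in range(1, n + 1)]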

all_caipu_dict = {}  # all scraped recipe records, keyed by recipe id
for url in caipu_page_url:  # iterate over every recipe list page
    caipulist = requests.get(url, headers=headers)  # open the list page
    bs_caipulist = bs4.BeautifulSoup(caipulist.text, 'html.parser')
    caipu_cate = bs_caipulist.find('h1').text  # cuisine name
    for food in bs_caipulist.find_all('div', class_="c"):
        foodinfo_dict = {}  # record for a single recipe
        foodinfo_dict['caixi'] = caipu_cate
        food_id = food.find_all('p')[0].find('a')['href'].split('/')[-1]  # recipe id
        foodinfo_dict['id'] = food_id
        foodinfo_dict['name'] = food.find_all('p')[0].find('a')['title']
        img_url = food.find_all('p')[0].find('img')['src']
        foodinfo_dict['img'] = img_url
#         path = 'D:\\Python\\Jupyter\\6 爬虫\\01食物热量爬虫\\食谱图片库\\' + img_url.split('/')[-1]
#         urlretrieve(img_url, path)  # download the image to the given path
        foodinfo_dict['type'] = food.find_all('p')[1].text  # recipe type
        foodinfo_dict['heat_cal'] = food.find_all('p')[2].text  # calories
        all_caipu_dict[food_id] = foodinfo_dict
#     print('scraped {} records so far'.format(len(all_caipu_dict.keys())))
#     sleep_time = np.random.randint(low=1, high=5)
#     time.sleep(sleep_time)  # wait a random 1-4 seconds

df_caipu = pd.DataFrame(all_caipu_dict.values())

df_caipu.head()
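
Finally, both tables can be persisted; CSV with a `utf-8-sig` BOM keeps the Chinese text readable when the files are opened in Excel (file names are arbitrary):

final_food_detail.to_csv('food_nutrition.csv', index=False, encoding='utf-8-sig')
df_caipu.to_csv('recipes.csv', index=False, encoding='utf-8-sig')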