# Scraper script
# Yun Health Net (hpcn21.com) - food calorie lookup
# Import required libraries and modules
from gevent import monkey
import gevent, requests, bs4, openpyxl, time
from gevent. queue import Queue
from openpyxl import load_workbook, Workbook, worksheet
import re
import numpy as np
import pandas as pd
from urllib. request import urlopen
from urllib. request import urlretrieve
import json
# Food calorie & nutrient crawler
# Fetch the food index page and build one URL per food category.
firstpage_url = "http://www.hpcn21.com/food"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    'Cookie': 'Hm_lvt_397c6669caf198f993b0bb244a74b53a=1657502938; Hm_lpvt_397c6669caf198f993b0bb244a74b53a=1657503671',
}
firstpage = requests.get(firstpage_url, headers=headers)
bs_firstpage = bs4.BeautifulSoup(firstpage.text, 'html.parser')
# Each <div class="cate"> holds an <a> whose href's last path segment
# is the category slug; append it to the index URL.
cate_list_url = [
    firstpage_url + '/' + div.find('a')['href'].split('/')[-1]
    for div in bs_firstpage.find_all('div', class_='cate')
]
# For every category page, read its paginator to find the highest page
# number, then expand the category into per-page URLs (.../p1.html ... /pN.html).
food_page_url = []
for url in cate_list_url:
    food_cate = requests.get(url, headers=headers)
    bs_food_cate = bs4.BeautifulSoup(food_cate.text, 'html.parser')
    # The last <a> inside the "pagelink" div carries the max page number.
    page_max_text = bs_food_cate.find('div', class_='pagelink').find_all('a')[-1]
    # Raw string for the regex (fixes invalid-escape SyntaxWarning).
    page_max = int(re.findall(r"\d+", str(page_max_text))[0])
    # Random 1-4 s pause between requests to avoid hammering the server.
    sleep_time = np.random.randint(low=1, high=5)
    time.sleep(sleep_time)
    food_page_url = food_page_url + [url + '/p' + str(x) + '.html' for x in range(1, page_max + 1)]
# Crawl food calorie info and food image URLs
# Crawl every listing page: one record per food (category, id, name,
# image URL, alias line, calorie line), keyed by food id so duplicates
# across pages collapse to a single entry.
all_food_dict = {}
for url in food_page_url:
    foodlist = requests.get(url, headers=headers)
    bs_foodlist = bs4.BeautifulSoup(foodlist.text, 'html.parser')
    # <h1> text looks like "<category>:..."; keep only the category part.
    food_cate = bs_foodlist.find('h1').text.split(':')[0]
    for food in bs_foodlist.find_all('div', class_="c fw_box_tt"):
        # Hoist the <p> list and <a> tag (original re-queried them per field;
        # also drops the unused img_url local).
        paragraphs = food.find_all('p')
        link = paragraphs[0].find('a')
        food_id = link['href'].split('/')[-1]
        all_food_dict[food_id] = {
            'foodcate': food_cate,
            'id': food_id,
            'name': link['title'],
            'img': paragraphs[0].find('img')['src'],
            'alias': paragraphs[1].text,
            'heat_cal': paragraphs[2].text,
        }
# Reshape the scraped data into a DataFrame
# Tidy the scraped listing into a DataFrame:
#  - alias arrives as "label:value" text -> keep the part after the colon
#    (assumes every alias line contains a ':' -- TODO confirm against the site)
#  - heat_cal arrives as free text -> extract the first (possibly decimal) number
df = pd.DataFrame(all_food_dict.values())
df['alias'] = df['alias'].apply(lambda x: x.split(':')[1])
# Raw string (fixes invalid-escape SyntaxWarning) and compiled once
# instead of re-parsing the pattern on every row.
_heat_pattern = re.compile(r"\d+(\.\d+)?")
df['heat_cal'] = df['heat_cal'].apply(lambda x: _heat_pattern.search(x).group()).astype('float')
# Food nutrient detail crawler
# Per-food detail crawler: each detail page lists nutrient names in
# <span class="dt"> and values in <span class="dd">; pair them up and
# merge the result back onto the listing DataFrame by food id.
food_detail_dict = {}
for food_id in df['id'].values:
    food_url = 'http://www.hpcn21.com/shiwu/' + food_id
    food_detail = requests.get(food_url, headers=headers)
    bs_food_detail = bs4.BeautifulSoup(food_detail.text, 'html.parser')
    detail_name = bs_food_detail.find_all('span', class_='dt')
    detail_num = bs_food_detail.find_all('span', class_='dd')
    # NOTE(review): values are offset by one ([1:]) relative to names --
    # presumably the first "dd" span is a header cell; confirm against the page.
    food_dt_dict = {name.text: num.text for name, num in zip(detail_name, detail_num[1:])}
    food_detail_dict[food_id] = food_dt_dict
    # Progress report every 100 foods crawled.
    if (len(food_detail_dict) % 100) == 0:
        print('已爬取{}条信息'.format(len(food_detail_dict)))
    # Random 1-4 s pause between requests to avoid hammering the server.
    sleep_time = np.random.randint(low=1, high=5)
    time.sleep(sleep_time)
food_detail_df = pd.DataFrame(food_detail_dict.values())
# Fix: assign a plain list of ids, not a single-column DataFrame --
# the original relied on fragile index alignment between two DataFrames.
food_detail_df['id'] = list(food_detail_dict.keys())
final_food_detail = df.merge(food_detail_df, how='left', on='id')
# Recipe data crawler
# Recipe (caipu) section: collect every cuisine URL from the index page,
# then expand each cuisine into per-page URLs using its paginator (if any).
caipu_url = 'http://www.hpcn21.com/caipu/2'
res = requests.get(caipu_url, headers=headers)
bs_res = bs4.BeautifulSoup(res.text, 'html.parser')
caipu_url_list = []
for href in bs_res.find('div', class_='caipu').find_all('a'):
    caipu_url_list.append('http://www.hpcn21.com' + href['href'])
caipu_page_url = []
for url in caipu_url_list:
    caipu = requests.get(url, headers=headers)
    bs_caipu = bs4.BeautifulSoup(caipu.text, 'html.parser')
    # Query the paginator once (original fetched the same div twice).
    pagelink_anchors = bs_caipu.find('div', class_='pagelink').find_all('a')
    if len(pagelink_anchors) > 0:
        # The last anchor's href holds the max page number in its 4th
        # path segment. Raw-string regex fixes the invalid-escape warning;
        # the redundant str() around an already-str segment is dropped.
        last_href = pagelink_anchors[-1]['href']
        page_max = int(re.findall(r"\d+", last_href.split('/')[3])[0])
        caipu_page_url = caipu_page_url + [url + '/p' + str(x) + '.html' for x in range(1, page_max + 1)]
    else:
        # No paginator: the cuisine has a single page.
        caipu_page_url = caipu_page_url + [url]
# Crawl every recipe listing page: one record per recipe (cuisine, id,
# name, image URL, type line, calorie line), keyed by recipe id.
all_caipu_dict = {}
for url in caipu_page_url:
    caipulist = requests.get(url, headers=headers)
    bs_caipulist = bs4.BeautifulSoup(caipulist.text, 'html.parser')
    caipu_cate = bs_caipulist.find('h1').text  # cuisine name from page title
    for food in bs_caipulist.find_all('div', class_="c"):
        # Hoist the <p> list and <a> tag (original re-queried them per field;
        # also drops the unused img_url local).
        paragraphs = food.find_all('p')
        link = paragraphs[0].find('a')
        food_id = link['href'].split('/')[-1]
        all_caipu_dict[food_id] = {
            'caixi': caipu_cate,
            'id': food_id,
            'name': link['title'],
            'img': paragraphs[0].find('img')['src'],
            'type': paragraphs[1].text,
            'heat_cal': paragraphs[2].text,
        }
df_caipu = pd.DataFrame(all_caipu_dict.values())
df_caipu.head()