介绍:
从下厨房网站爬取热门菜谱清单,内含:菜名、原材料、详细烹饪流程的 URL。
思路一:先爬取所有的最小父级标签 <div class="info pure-u">,然后针对每一个父级标签,提取其中的菜名、URL、食材。
思路二:分别提取所有的菜名、所有的 URL、所有的食材。然后让菜名、URL、食材一一对应起来(这并不复杂,第 0 个菜名,对应第 0 个 URL,对应第 0 组食材,按顺序走即可)
思路一:使用BeautifulSoup解析方式进行爬取
完整代码
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Module-level accumulator: one dict per recipe, shared by url_parse() and
# dumped to Excel by run().
data: list = []
def url_create(pages=20):
    """Build the list of explore-page URLs to crawl.

    The site paginates its popular-recipe listing via a ``page`` query
    parameter; at the time of writing there are 20 pages, kept as the
    default so existing callers are unaffected.

    :param pages: number of listing pages to generate URLs for.
    :return: list of URL strings for page 1 .. ``pages``.
    """
    base = 'https://www.xiachufang.com/explore/?page={}'
    return [base.format(i) for i in range(1, pages + 1)]
def url_parse(url):
    """Fetch one listing page and append each recipe's name, ingredients
    and absolute detail-page URL to the module-level ``data`` list.

    :param url: URL of one explore page (from ``url_create``).
    :raises requests.HTTPError: if the page returns an error status.
    """
    # Pretend to be a regular browser to get past basic anti-scraping checks.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()  # fail loudly instead of parsing an error page
    bs = BeautifulSoup(resp.text, 'html.parser')
    # Each recipe card lives in <div class="info pure-u">.
    for card in bs.find_all('div', class_='info pure-u'):
        link = card.find('a')
        if link is None:  # skip malformed cards instead of crashing
            continue
        name = link.text.strip()
        ing = card.find('p', class_='ing ellipsis')
        materials = ing.text.strip() if ing is not None else ''
        # BUG FIX: the href is site-relative (e.g. "/recipe/123/"); naive
        # concatenation with the page URL produced broken links like
        # ".../explore/?page=1/recipe/123/". urljoin resolves it correctly.
        url_1 = urljoin(url, link['href'])
        # Key '菜名' kept consistent with the XPath variant and the intro.
        data.append({'菜名': name, '原材料': materials, '制作流程链接': url_1})
def run():
    """Crawl every listing page, then export all collected rows to Excel."""
    for page_url in url_create():
        url_parse(page_url)
    # One row per recipe; drop the numeric index in the output file.
    pd.DataFrame(data).to_excel('data.xlsx', index=False)
# Script entry point: crawl all pages and write data.xlsx.
if __name__ == '__main__':
    run()
思路二:使用xpath解析方式进行爬取
变更函数代码
def url_parse(url):
    """XPath variant of ``url_parse``: fetch one listing page and append each
    recipe's name, ingredients and absolute detail URL to ``data``.

    :param url: URL of one explore page (from ``url_create``).
    :raises requests.HTTPError: if the page returns an error status.
    """
    # BUG FIX: the original snippet used etree without importing it anywhere.
    from lxml import etree
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    tree = etree.HTML(resp.text)
    # Absolute path to the <li> recipe cards — brittle if the page layout
    # changes, but kept as in the original.
    for item in tree.xpath('/html/body/div[4]/div/div/div[1]/div[1]/div/div[2]/div[1]/ul/li'):
        name = item.xpath("./div/div/p[1]/a/text()")[0].strip()
        # p[2] mixes whitespace text nodes with the ingredient links;
        # dropping the first and last node trims the surrounding noise.
        materials = "".join(item.xpath("./div/div/p[2]//text()")[1:-1]).strip()
        # BUG FIX: the @href is site-relative; naive concatenation with the
        # page URL produced broken links. urljoin resolves it correctly.
        url_1 = urljoin(url, item.xpath("./div/div/p[4]/a/@href")[0])
        data.append({'菜名': name, '原材料': materials, '制作流程链接': url_1})