Python Crawler Notes 1: Downloading All Photos of a Given Celebrity from Douban

Learned from this article: https://www.bilibili.com/read/cv10367703/

------------------------------------------------------------------------

https://movie.douban.com/celebrity/1011562/photos/

Open the page and press F12 to bring up the developer tools. There you can see how the photo list is paginated: each page shows at most 30 images.
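Each page is addressed through a start offset that grows in steps of 30. A minimal sketch of the paginated URL (the query parameters are the same ones used in the full script below):

# Photo-list URL template; start goes 0, 30, 60, ... (30 photos per page)
url_tpl = 'https://movie.douban.com/celebrity/{id}/photos/?type=C&start={start}&sortby=like&size=a&subtype=a'
print(url_tpl.format(id='1011562', start=30))  # second page of this celebrity's photos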

a_list = content.find_all('div', attrs={'class': 'cover'})  # every div with class "cover" (one per photo)
picture_list = []
for d in a_list:
    plist = d.find('img')['src']  # thumbnail URL inside the cover div
    picture_list.append(plist)
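An equivalent one-liner, if you prefer BeautifulSoup's CSS selectors (my variant, not what the script below uses):

picture_list = [img['src'] for img in content.select('div.cover img')]  # every <img> inside a div.cover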

The page also shows the total, e.g. "(共348张)" ("348 photos in total"); a regular expression pulls out the bare number 348:

clist = content.find('span', attrs={'class': 'count'})  # the "(共NNN张)" counter span
ret = re.findall(r'\d+', clist.get_text())               # extract the digits
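For example, run against the caption text itself:

import re
text = '(共348张)'               # "348 photos in total"
print(re.findall(r'\d+', text))   # ['348']; int() of the first match gives the photo count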

Spelling all of this out is tedious, so here is the complete script; read through it and the pieces above will fall into place:

import re
import time
import requests
import os
from bs4 import BeautifulSoup
import lxml  # not used directly; installing it lets BeautifulSoup use the faster 'lxml' parser

# requests.get must be sent with headers, otherwise Douban returns an empty body; keeping only 'User-Agent' is enough
headers={
    # 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # 'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
    # 'Cache-Control':'max-age=0',
    # 'Connection':'keep-alive',
    # 'Cookie':'ll="118254"; bid=bzf7LGz3pZA; _vwo_uuid_v2=DB12523A0B0C7127645E914A1FB363352|3d83981785084d997d7462a2ce24a947; __utmz=223695111.1626234491.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; douban-fav-remind=1; __utmz=30149280.1629095213.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1629168071%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D71zldwjBiMBa-xfexgVZ43eTQq2n8KKtTWTsWh37m72e_lfEOE1x3NuDj6egeYBLyqGE4gjSJnbxueQLcYZWsq%26wd%3D%26eqid%3Ddb6736ec000219350000000660ee5e6f%22%5D; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1335619985.1616046306.1629095213.1629168072.4; __utmc=30149280; __utmb=30149280.1.10.1629168072; __utma=223695111.444014824.1616046306.1626234491.1629168075.3; __utmb=223695111.0.10.1629168075; __utmc=223695111; _pk_id.100001.4cf6=fa72408676bee41c.1616046306.3.1629168230.1626234491.',
    # 'Host':'movie.douban.com',
    # 'sec-ch-ua':'" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    # 'sec-ch-ua-mobile':'?0',
    # 'Sec-Fetch-Dest':'document',
    # 'Sec-Fetch-Mode':'navigate',
    # 'Sec-Fetch-Site':'none',
    # 'Sec-Fetch-User':'?1',
    # 'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}

def get_poster_url(res):
    content = BeautifulSoup(res.text, 'lxml')  # parse the page source into a BeautifulSoup object
    #content = BeautifulSoup(res.text, 'html.parser')  # the built-in parser works too, just slower
    a_list = content.find_all('div', attrs={'class': 'cover'})  # every div with class "cover" (one per photo)
    picture_list = []
    for d in a_list:
        plist = d.find('img')['src']  # thumbnail URL inside the cover div
        picture_list.append(plist)
    return picture_list

def getCount(id, session):
    # Fetch the first photo page and read the total photo count from the "(共NNN张)" span
    url = 'https://movie.douban.com/celebrity/{0}/photos/'.format(id)
    res = session.get(url=url, headers=headers)
    content = BeautifulSoup(res.text, 'lxml')
    clist = content.find('span', attrs={'class': 'count'})  # the counter span
    ret = re.findall(r'\d+', clist.get_text())
    if len(ret) > 0:
        return [res, int(ret[0])]
    else:
        return [res, 0]

def fire(mc, id, session):
    res, total = getCount(id, session)  # first-page response plus total photo count
    if total == 0:
        return

    page = 0
    for i in range(0, total, 30):  # 30 photos per page; i is the start offset
        print("\nScraping {0}, page {1}: photos {2}~{3} of {4}\n".format(
            mc, page + 1, page * 30 + 1, min((page + 1) * 30, total), total))
        url = 'https://movie.douban.com/celebrity/{0}/photos/?type=C&start={1}&sortby=like&size=a&subtype=a'.format(id, i)
        if i > 0:  # page 1 was already fetched inside getCount
            res = session.get(url=url, headers=headers)
        piclist = get_poster_url(res)
        download_picture(piclist, session)
        page = page + 1
        time.sleep(1)  # be polite to the server between pages

def download_picture(pic_l, session):
    if not os.path.exists(r'picture'):
        os.mkdir(r'picture')  # all images go into a local ./picture folder
    for i in pic_l:
        print("\rDownloading {0}".format(i))
        pic = session.get(i)
        p_name = i.split('/')[-1]  # filename = last path segment of the image URL
        with open(os.path.join('picture', p_name), 'wb') as f:
            f.write(pic.content)


mxarr = [('山口百惠', '1014823'), ('刘涛', '1011562')]  # (celebrity name, Douban celebrity id) pairs
if __name__ == '__main__':
    # requests.Session keeps cookies across requests automatically. It cannot execute
    # JavaScript, so cookies set via JavaScript are never captured.
    session = requests.Session()
    for name, cid in mxarr:
        fire(name, cid, session)
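
One optional hardening, not in the original script: check the HTTP status before writing, so a failed request does not leave a corrupt file on disk. download_picture_safe below is a hypothetical drop-in replacement for download_picture:

def download_picture_safe(pic_l, session):
    os.makedirs('picture', exist_ok=True)   # no error if the folder already exists
    for url in pic_l:
        pic = session.get(url)
        if pic.status_code != 200:          # skip anything that didn't download cleanly
            print('skipping {0} (HTTP {1})'.format(url, pic.status_code))
            continue
        p_name = url.split('/')[-1]         # filename = last path segment
        with open(os.path.join('picture', p_name), 'wb') as f:
            f.write(pic.content)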

