照片爬取BeautifulSoup

最新推荐文章于 2023-06-06 21:36:36 发布

m0_58662220

最新推荐文章于 2023-06-06 21:36:36 发布

阅读量124

点赞数

本文链接：https://blog.csdn.net/m0_58662220/article/details/122363251

版权

关键词由CSDN通过智能技术生成

#用到的技术点：
# 1.requests 发送请求，从服务器获取到数据
# 2.beautifulsoup 来解析整个页面的源代码
import requests
import json
import urllib
from bs4 import BeautifulSoup
# Download the first image of every gallery linked from the listing page and
# print a value parsed from the <script> that follows each gallery's image.

# Send the request for the gallery listing page.
resp = requests.get("https://www.umei.cc/meinvtupian/meinvxiezhen/")
# If the text comes back garbled, set this to whatever the page's charset declares.
resp.encoding = 'utf-8'
# Parse the HTML so the clickable links can be located.
main_page = BeautifulSoup(resp.text, "html.parser")
# find() returns the first match; find_all() returns every match.
# The TypeList <div> is the container that is searched for gallery links
# (per the original author's note, each <li>..</li> inside it is one picture).
typelist = main_page.find("div", attrs={"class": "TypeList"})
# Narrow down to the anchors carrying the TypeBigPics class.
alst = typelist.find_all("a", attrs={"class": "TypeBigPics"})
n = 1  # running index used to name the saved files: tu_1.jpg, tu_2.jpg, ...
for a in alst:
    # Request the child page that the anchor points at (href is site-relative).
    href = "https://www.umei.cc" + a.get("href")
    resp1 = requests.get(href)
    resp1.encoding = "utf-8"
    child_page = BeautifulSoup(resp1.text, "html.parser")
    # Look the ImageBody <div> up once; both the image and the script below hang off it.
    image_body = child_page.find("div", attrs={"class": "ImageBody"})
    # Real URL of the picture shown on this child page.
    src = image_body.find("img").get("src")
    # "wb": the payload is binary image data, not text. The with-block makes
    # sure the file is flushed and closed on every iteration.
    with open("tu_%s.jpg" % n, mode="wb") as f:
        f.write(requests.get(src).content)
    n += 1
    # Only the first picture of each gallery is saved above. The <script> that
    # follows ImageBody carries extra data about the gallery.
    text = image_body.find_next("script").text
    # Take the second comma-separated field and strip its surrounding double
    # quotes; per the original author's note this is a value such as "12"
    # (presumably the number of pictures in the series - TODO confirm).
    num = text.split(",")[1].strip("\"")
    # Only printed here; fetching the whole series would need a further loop.
    print(num)