初试python爬虫（简单爬取站长之家第一页图片）

最新推荐文章于 2022-05-28 18:44:26 发布

流氓鳄霸

最新推荐文章于 2022-05-28 18:44:26 发布

阅读量298

点赞数

文章标签： python

本文链接：https://blog.csdn.net/weixin_56643238/article/details/123819695

版权

爬取站长之家第一页图片

爬虫——需要借助以下第三方库：
requests、beautifulsoup4、html5lib
1. 模拟浏览器发送请求，并接收服务器的响应数据：requests
2. 解析并抓取服务器响应的数据：beautifulsoup4（抓取数据）+ html5lib（作为解析器，把响应内容解析成 HTML 文档树）
3. 抓取所需的数据，并保存到对应的文件中

import os.path
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# Address of the first page of the Chinaz image gallery.
url = "https://sc.chinaz.com/tupian/"
# Directory the downloaded images are written to.
download = "img/"
# Seconds to wait for any single HTTP request; without a timeout
# requests.get() can block forever on a stalled connection.
TIMEOUT = 10
# Characters that are not allowed in file names on common file systems
# (path separators, Windows-reserved punctuation, control characters).
_UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def safe_filename(title, fallback):
    """Return *title* made safe for use as a file name.

    Unsafe characters are replaced with ``_`` and surrounding spaces/dots
    are stripped. If nothing usable is left, *fallback* is returned.
    """
    cleaned = _UNSAFE_CHARS.sub("_", title).strip(" .")
    return cleaned or fallback


def absolute_url(src):
    """Resolve an image address found on the page against the page ``url``.

    Handles protocol-relative (``//host/a.jpg``), already absolute and
    path-relative addresses alike.
    """
    return urljoin(url, src)


def save_image(src, alt, index):
    """Download one image and store it as ``<alt>.jpg`` inside ``download``.

    *index* is only used to build a fallback file name when *alt* is empty
    or consists solely of unsafe characters.

    Returns True when the file was written, False when the download failed.
    """
    # Fetch first and open the file afterwards, so a failed request does not
    # leave an empty or truncated .jpg behind.
    try:
        res = requests.get(absolute_url(src), timeout=TIMEOUT)
        res.raise_for_status()
    except requests.RequestException as err:
        print("skipped", src, err)
        return False
    name = safe_filename(alt, "image_%d" % index) + ".jpg"
    with open(os.path.join(download, name), "wb") as file:
        file.write(res.content)
    return True


def main():
    """Download the images listed on the first gallery page into ``download``."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(download, exist_ok=True)

    response = requests.get(url, timeout=TIMEOUT)
    # Only continue when the page request succeeded (status code 200).
    if response.status_code != 200:
        print("request failed with status", response.status_code)
        return

    # The parser is given the raw bytes and detects the encoding itself, so
    # response.encoding does not need to be adjusted here.
    bs = BeautifulSoup(response.content, "html5lib")

    # Element that holds the image list.
    con = bs.select_one("#container")
    if con is None:
        print("no #container element found on the page")
        return

    # Every second <img> is kept, as in the original script.
    # NOTE(review): this depends on the current page layout - confirm.
    for index, img in enumerate(con.select("img")[0::2]):
        # The real image address is read from "src2".
        # NOTE(review): presumably a lazy-loading attribute - confirm.
        src = img.get("src2")
        if not src:
            continue
        alt = img.get("alt", "")
        print(src, alt)
        save_image(src, alt, index)


if __name__ == "__main__":
    main()