Python爬虫五天总结

最新推荐文章于 2023-04-25 19:32:38 发布

垃圾桶里也挺好

最新推荐文章于 2023-04-25 19:32:38 发布

阅读量905

点赞数 1

文章标签： python 爬虫

本文链接：https://blog.csdn.net/maelee/article/details/122486768

版权

八.总结

1.完整项目练习

爬取壁纸分类中“全部”的前10页所有组图。

import requests
import re
import json
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

#该函数负责获取到每一个详情页href
def get_detial_href(url):
    """Return absolute URLs of the detail pages linked from one list page.

    Args:
        url: URL of a wallpaper list page on desk.zol.com.cn.

    Returns:
        A list of absolute detail-page URLs in document order. Links that
        resolve to a different host (for example the desktop-client
        installer the original code skipped by exact URL) are left out.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # Local import so the block does not depend on edits to the file header.
    from urllib.parse import urljoin, urlparse

    # Site root used to resolve the hrefs found in the page.
    base = "https://desk.zol.com.cn"
    host = urlparse(base).netloc

    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, timeout=10)
    # Force GBK decoding of the response body before parsing it.
    resp.encoding = "gbk"
    et = etree.HTML(resp.text)
    if et is None:
        # Nothing parseable came back (e.g. an empty body).
        return []

    # The class value contains two consecutive spaces; the XPath compares the
    # attribute as an exact string, so it must stay exactly as written.
    hrefs = et.xpath("//ul[@class='pic-list2  clearfix']/li/a/@href")

    new_hrefs = []
    for href in hrefs:
        # urljoin leaves absolute links intact instead of gluing the domain
        # in front of them, which would produce an unusable URL.
        full = urljoin(base, href)
        if urlparse(full).netloc != host:
            continue
        new_hrefs.append(full)
    return new_hrefs

#访问每一个详情页，得到每个详情页背后对应的一组图片下的路径
# Captures whatever is assigned to ``var deskPicArr`` up to the first ";".
# Compiled once at import time instead of on every call.
_DESK_PIC_PATTERN = re.compile(r"var deskPicArr.*?=(?P<desk_str>.*?);", re.S)


def get_img_src(href):
    """Return the image URLs of the album shown on one detail page.

    The page embeds the album as a JSON literal assigned to the JavaScript
    variable ``deskPicArr``. Each entry of its ``list`` carries an ``imgsrc``
    template containing ``##SIZE##`` and an ``oriSize`` value to substitute
    for that placeholder.

    Args:
        href: Absolute URL of a detail page.

    Returns:
        A list of image URLs with ``##SIZE##`` replaced by ``oriSize``.
        An empty list if the page does not contain ``deskPicArr``.

    Raises:
        requests.RequestException: if the request fails or times out.
        json.JSONDecodeError: if the captured text is not valid JSON.
        KeyError: if the decoded object has no ``list`` key.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(href, timeout=10)
    # Force GBK decoding of the response body before searching it.
    resp.encoding = "gbk"

    match = _DESK_PIC_PATTERN.search(resp.text)
    if match is None:
        # A page without the variable would otherwise crash on ``.group``.
        print(f"{href} 中未找到 deskPicArr，跳过")
        return []

    # The captured text is a JSON object literal; decode it into a dict.
    deskPic = json.loads(match.group("desk_str"))

    img_src_list = []
    for item in deskPic['list']:
        oriSize = item.get("oriSize")
        imgsrc = item.get("imgsrc")
        if oriSize is None or imgsrc is None:
            # Incomplete entry: there is no usable URL to build from it.
            continue
        img_src_list.append(imgsrc.replace("##SIZE##", oriSize))
    return img_src_list

#下载函数
def download_img(imgsrc):
    """Download one image into the local ``picture/`` directory.

    The file name is the last path segment of the URL. Progress and failures
    are reported with ``print``. A failed request is reported and skipped
    rather than raised, because this function runs inside a thread pool
    whose futures are never inspected, so an exception would vanish silently.

    Args:
        imgsrc: Absolute URL of the image to download.

    Returns:
        None.
    """
    # Local import so the block does not depend on edits to the file header.
    import os

    name = imgsrc.split("/")[-1]
    print(f"开始下载{name}")
    try:
        # A timeout keeps a stalled download from blocking a worker forever.
        resp_img = requests.get(imgsrc, timeout=30)
        # Do not save an HTTP error page under an image file name.
        resp_img.raise_for_status()
    except requests.RequestException as exc:
        print(f"{name}下载失败: {exc}")
        return

    # Create the target directory on first use; exist_ok makes this safe
    # when several worker threads reach this line at the same time.
    os.makedirs("picture", exist_ok=True)
    # Images are binary, so write the raw bytes rather than decoded text.
    with open(f"picture/{name}", mode="wb") as f:
        f.write(resp_img.content)
    print(f"{name}下载完毕")

def main(pages=10):
    """Crawl the wallpaper list pages and download every album image.

    For each list page this collects the detail-page links, gathers the
    image URLs of each album, and downloads them with a pool of 20 threads.

    Args:
        pages: Number of list pages to crawl, starting at page 1.
            Defaults to 10.

    Returns:
        None.
    """
    base_url = "https://desk.zol.com.cn/pc/"
    # Inclusive upper bound: pages=10 must visit pages 1 through 10.
    for i in range(1, pages + 1):
        # Page 1 is the bare directory URL; later pages are "<n>.html".
        url = base_url if i == 1 else f"{base_url}{i}.html"

        print("抓取到首页中每个详情页的href......")
        hrefs = get_detial_href(url)

        print("访问每一个详情页，得到每一个详情页中的一组图片......")
        # All image URLs found on this list page.
        img_list = []
        for href in hrefs:
            img_list.extend(get_img_src(href))

        # Leaving the ``with`` block waits for this page's downloads to
        # finish before the next page is fetched.
        with ThreadPoolExecutor(20) as t:
            for img in img_list:
                t.submit(download_img, img)
    print("All over!!!")

# Script entry point: start the crawl only when this file is run directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()