Python——爬虫练习，爬取小姐姐

最新推荐文章于 2024-05-11 11:28:36 发布

重口味的伪程序员

最新推荐文章于 2024-05-11 11:28:36 发布

阅读量6.9k

点赞数 11

分类专栏：牌神Python

本文链接：https://blog.csdn.net/weixin_43087443/article/details/88079081

版权

牌神Python 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

自从学了 Python，我就发现程序员是不缺对象的——“Python 里的所有数据都是以对象的形式存在的，无论是简单的数字类型还是复杂的代码模块”，233
目标网站：url='http://www.win4000.com/meinvtag4_1.html'

在这里插入图片描述
根本把持不住呀！

瞎几把分析一下：

在这里插入图片描述

一级页面找到了，F12—copy xpath，行云流水

在这里插入图片描述

/html/body/div[4]/div/div[3]/div[1]/div[1]/div[2]/div/div/ul/li[1]/a

同样方法获取到二级页面的.jpg地址

在这里插入图片描述

开写！

import requests as re
from lxml import etree
import os,pprint
# Shared state filled by the scraping functions and read by down_and_save():
# jpg_title[i] is the album title for the list of image URLs in jpg_url[i].
jpg_title, jpg_url = [], []
# Images are saved in this folder beneath the current working directory.
down_dir = os.path.join(os.getcwd(), 'beautiful_girls/')
def get_url_one(url):
    """Collect album links from one first-level listing page.

    Downloads ``url`` and reads the ``href`` of the first ``<a>`` found under
    each of the first 24 ``<li>`` entries at a fixed absolute XPath.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of ``href`` values in page order. Entries that are absent
        from the page are skipped, so the list may hold fewer than 24 items.
    """
    req = re.get(url)  # ``re`` is this file's alias for ``requests``
    # Parse the document once instead of once per list item.
    tree = etree.HTML(req.text)

    all_url_one = []
    for i in range(1, 25):  # li[1] .. li[24] of the listing on this page
        xp = '/html/body/div[4]/div/div[3]/div[1]/div[1]/div[2]/div/div/ul/li[' + str(i) + ']/a'
        ele = tree.xpath(xp)
        if not ele:
            # A short page has fewer entries; skip rather than raise IndexError.
            continue
        all_url_one.append(ele[0].attrib['href'])
    return all_url_one
   
def get_url_two_page(ls):
    """Expand each album URL in ``ls`` into the URLs of its sub-pages.

    For every album URL the page is fetched, the text of its ``<h1>`` title
    is appended to the module-level ``jpg_title`` list, and the page count
    read from the ``<em>`` element is used to build ``<base>_1.html`` ..
    ``<base>_N.html``. ``<base>`` is the album URL minus its last five
    characters (presumably the ``.html`` suffix; this is not checked).

    Args:
        ls: List of album page URLs. It is modified in place: each URL
            string is replaced by the list of its sub-page URLs.

    Returns:
        The same list object ``ls``, now holding one list of URLs per album.

    Raises:
        IndexError: If the title or page-count element is not found.
    """
    xp_title = '/html/body/div[4]/div/div[2]/div/div[1]/div[1]/h1'
    xp_page = '/html/body/div[4]/div/div[2]/div/div[1]/div[1]/em'

    for i, album_url in enumerate(ls):
        req = re.get(album_url)  # ``re`` is this file's alias for ``requests``
        # Parse the response once and run both XPath queries on the same tree.
        tree = etree.HTML(req.text)

        ele_title = tree.xpath(xp_title)
        # append() mutates the module-level list; no ``global`` is required.
        jpg_title.append(str(ele_title[0].text))

        ele_page = tree.xpath(xp_page)
        page_count = int(ele_page[0].text)

        base = album_url[:len(album_url) - 5]
        ls[i] = [base + '_' + str(x) + '.html' for x in range(1, page_count + 1)]
    return ls
    
def get_url_two(url):
    """Return the image address found on a single picture page.

    Fetches ``url``, selects the ``<img>`` elements under the ``<a>`` child of
    the element whose id is ``pic-meinv``, and returns the ``url`` attribute
    of the first match.
    """
    page = re.get(url)
    document = etree.HTML(page.text)
    images = document.xpath('//*[@id="pic-meinv"]/a/img')
    first_image = images[0]
    return first_image.attrib['url']



def main():
    """Scrape listing pages 1 to 5, then download everything collected.

    For each listing page the album links are gathered with get_url_one(),
    expanded into per-album sub-page URLs with get_url_two_page(), and every
    sub-page is resolved to an image address with get_url_two(). One list of
    image addresses per album is appended to the module-level ``jpg_url``.
    After all listing pages are processed, down_and_save() is called once.
    """
    for page_no in range(1, 6):
        website = 'http://www.win4000.com/meinvtag4_%s.html' % page_no
        albums = get_url_two_page(get_url_one(website))

        for sub_pages in albums:
            image_urls = []
            for sub_page in sub_pages:
                print(sub_page)
                image_urls.append(get_url_two(sub_page))
            jpg_url.append(image_urls)

    down_and_save()
    



def down_and_save():
    """Download every collected image and store it under ``down_dir``.

    Each title in the module-level ``jpg_title`` is paired with the list of
    image URLs at the same position in ``jpg_url``. Files are named
    ``<title><n>.jpg`` where ``n`` counts from 1 within each album. If the
    two lists differ in length, the extra entries of the longer one are
    ignored.

    Characters that cannot appear in a file name are replaced in the title,
    and responses with an error status are reported and skipped so that an
    error page is not saved as a ``.jpg`` file.
    """
    for title, urls in zip(jpg_title, jpg_url):
        safe_title = _safe_file_name(title)
        for num, jpg in enumerate(urls, start=1):
            file_name = safe_title + str(num) + '.jpg'
            print(file_name, jpg)
            res = re.get(jpg)  # ``re`` is this file's alias for ``requests``
            if not res.ok:
                print('download failed:', jpg, res.status_code)
                continue
            with open(os.path.join(down_dir, file_name), 'wb') as f:
                for chunk in res.iter_content(100000):
                    f.write(chunk)


def _safe_file_name(title):
    """Return ``title`` with path separators and reserved characters replaced by ``_``."""
    # str.translate is used because the name ``re`` is bound to requests here.
    return title.translate(str.maketrans({ch: '_' for ch in '\\/:*?"<>|'}))
                


if __name__=='__main__':
    # Create the download directory if it is missing. exist_ok=True avoids
    # the race between a separate existence check and the creation call.
    os.makedirs(down_dir, exist_ok=True)
    print('图片保存在:'+down_dir)

    main()

终于写完了。。。代码拙劣啊。。。不过终于把1082张照片收入囊中。。。
在这里插入图片描述

重口味的伪程序员

关注

11
点赞
踩
68

收藏

觉得还不错? 一键收藏
7
评论
Python——爬虫练习，爬取小姐姐

自从学了 Python，我就发现程序员是不缺对象的——“Python 里的所有数据都是以对象的形式存在的，无论是简单的数字类型还是复杂的代码模块”，233 目标网站：url='http://www.win4000.com/meinvtag4_1.html' 根本把持不住呀！瞎几把分析一下：一级页面找到了，F12—copy xpath，行云流水 /html/body/div[4]/div...
复制链接

扫一扫