python爬虫绝对领域


前言

试着爬了一下绝对领域网站,代码仅用于学习和测试。


代码如下(示例):


import json
import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
# NOTE(review): these module-level sample URLs are not used anywhere in this
# file — the JD class below defines its own copies. Kept for reference.
url = "https://www.jdlingyu.com/"
tagurl="https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"




class JD():
    """Scraper for www.jdlingyu.com.

    Workflow: collect the tag-cloud links, list every post under each tag
    (optionally dumped to ``tag_all_url.json``), then mirror the whole
    structure on disk as ``img/<tag>/<post>/<images>``.
    """

    def __init__(self):
        # Site entry points; tagurl is any post page that carries the tag cloud.
        self.url = "https://www.jdlingyu.com/"
        self.tagurl = "https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"
        # Browser-like UA so the site does not reject the requests.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        # Query string appended to a tag URL to sort its posts by view count.
        self.post = "?post_order=views"

    @staticmethod
    def _sanitize(name):
        """Replace characters illegal in Windows folder names with '_'."""
        return re.sub(r'[\/:*?"<>|]', "_", name)

    def _get_soup(self, url):
        """Fetch *url* and return it parsed as UTF-8 html.parser soup."""
        response = requests.get(url, headers=self.header, timeout=30)
        return BeautifulSoup(response.content.decode("utf-8"), "html.parser")

    def get_all_tag(self):
        """Return {tag text: tag URL} scraped from the page's tag cloud.

        Returns an empty dict if the tag-cloud div is missing (layout change
        or blocked request) instead of raising AttributeError.
        """
        allsoup = self._get_soup(self.tagurl)
        tagcloud_soup = allsoup.find("div", attrs={"class": "tagcloud"})
        if tagcloud_soup is None:
            return {}
        return {
            tag.text: tag.attrs["href"]
            for tag in tagcloud_soup.find_all("a", attrs={"class": "tag-cloud-link"})
        }

    def get_a_tag_url(self, url):
        """Return {post title: post URL} for every post listed under one tag."""
        allsoup = self._get_soup(url + self.post)
        # The trailing space in "b2_gap " matches the site's actual markup.
        gap_soup = allsoup.find("ul", attrs={"class": "b2_gap "})
        if gap_soup is None:
            return {}
        gapdict = {}
        for gap in gap_soup.find_all("div", attrs={"class": "post-info"}):
            link = gap.find("a")
            print(link.attrs["href"])
            gapdict[link.text] = link.attrs["href"]
        return gapdict

    def tag_all_url(self):
        """Crawl every tag and dump {tag: {title: url}} to tag_all_url.json."""
        gapdict = {
            tag: self.get_a_tag_url(url)
            for tag, url in self.get_all_tag().items()
        }
        with open("tag_all_url.json", "w", encoding='utf-8') as f:
            f.write(json.dumps(gapdict, ensure_ascii=False, indent=4,
                               separators=(',', ':')))
        return gapdict

    def load_json(self) -> dict:
        """Load the previously saved tag_all_url.json into a dict."""
        with open("tag_all_url.json", encoding="utf-8") as f:
            return json.load(f)

    def get_img(self, url):
        """Download every <img> inside the post's entry-content div into cwd.

        Fix: the original re-downloaded imglist[0] a second time after the
        loop (and crashed with IndexError on posts without images).
        """
        res_html = self._get_soup(url)
        content = res_html.find("div", attrs={"class": "entry-content"})
        if content is None:
            return
        for img in content.find_all("img"):
            newurl = img["src"]
            # Save under the image's own file name (last path component).
            urllib.request.urlretrieve(newurl, newurl.split("/")[-1])

    def folder(self, listname):
        """Create a directory named after *listname* (sanitized) and return
        the name actually created.

        Fix: the original appended "1" only once on a name collision, so a
        second collision made os.makedirs raise; now a counter increments
        until a free name is found.
        """
        base = self._sanitize(listname)
        path = base
        suffix = 1
        while os.path.exists(path):
            path = base + str(suffix)
            suffix += 1
        os.makedirs(path)
        return path

    def run(self):
        """Walk the saved JSON and mirror it as img/<tag>/<post>/<images>.

        Fix: chdir into the directory `folder` actually created (its return
        value), not the raw name — the original descended into a pre-existing
        directory whenever `folder` renamed on collision.
        """
        json_file = self.load_json()
        os.chdir(self.folder("img"))
        for k, v in json_file.items():
            k = self._sanitize(k)
            print("正在下载大分类:" + k)
            os.chdir(self.folder(k))
            print("文件夹建立成功,进入文件夹中")
            for k2, v2 in v.items():
                k2 = self._sanitize(k2)
                print("正在下载:" + k2)
                os.chdir(self.folder(k2))
                print("文件夹建立成功,进入文件夹中")
                try:
                    self.get_img(v2)
                    print("下载完毕,正在返回上一级")
                except Exception:
                    # Keep crawling the remaining posts on any failure.
                    print("下载失败,正在返回上一级")
                os.chdir("..")
            os.chdir("..")
               







if __name__ == '__main__':
    # Script entry point: build the scraper and run the full download pipeline.
    spider = JD()
    spider.run()
    







  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值