Scraping images from 飞G图 (girl13.com) with Python 3

Overview: scrape the images from http://www.girl13.com. The crawler sleeps between listing pages to avoid an IP ban:
self.time = 2  # interval between pages; defaults to 2 s to avoid an IP ban
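
Besides the standard library, both versions depend on requests, beautifulsoup4, and html5lib (the parser handed to BeautifulSoup); assuming a standard pip setup, they can be installed with: pip install requests beautifulsoup4 html5lib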

First version

import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):
    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # interval between pages (seconds)

    # Fetch a URL; return the response on HTTP 200, otherwise False
    def get_status(self, url):
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        else:
            print("ERROR: network request failed!")
            return False

    # Visit the home page to establish the session
    def get_index(self, url):
        response = self.get_status(url)
        if response:
            print("Connected to the home page...")
            return True
        else:
            print("ERROR: failed to reach the home page!")
            return False

    # Parse a listing page and map image file names to image URLs
    def parse_html(self, url):
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")
        for column in columns:
            titles = column.select(".entry-title")
            title = titles[0].text if titles else None
            imgs = column.select(".entry-content.cf img")
            img_url = imgs[0].get("src") if imgs else None

            if not title:
                continue
            # Note: os.path.basename raises TypeError when img_url is None;
            # the second version below handles that case
            title = os.path.basename(img_url)
            title_url[title] = img_url
        return title_url

    # Find the number of the last page from the pagination bar
    def get_last_page(self, url):
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        # If the last link is "下一页" (next page), the page number precedes it
        if pages[-1].text == "下一页":
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    # Yield the URL of every listing page
    @staticmethod
    def next_page(last_page):
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    # Download one image to the given path
    def download(self, path, url):
        print(url)
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        # Establish the session via the home page
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None

        # Find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None

        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)

        # Walk through every listing page
        urls = self.next_page(last_page)
        for url in urls:
            title_url = self.parse_html(url)
            if not title_url:
                continue
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                img_url = title_url[title]

                # One thread per image on the current page
                t = threading.Thread(target=self.download, args=(path, img_url))
                thread_list.append(t)

            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

            time.sleep(self.time)

    def main(self):
        # Run main_ in a daemon thread; join() waits for it to finish
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
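
Running this version (e.g. python3 girl13.py, assuming that file name) creates an image directory in the current working directory and downloads every image from every listing page, pausing self.time seconds between pages. It works until the first dropped connection: an uncaught requests.exceptions.ConnectionError ends the crawl.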

Second version

Fixes the first version's unexpected exits (uncaught connection and parsing errors).

# -*- coding: utf-8 -*-
import os
import time
import requests
import threading
from bs4 import BeautifulSoup


class Girl13(object):
    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # interval between pages (seconds)

    def get_status(self, url):
        """
        Fetch a URL.
        :param url: address to request
        :return: the response on HTTP 200, otherwise False
        """
        try:
            response = self.session.get(url, headers=self.headers)
            if response.status_code == 200:
                return response
            else:
                print("ERROR: network request failed!")
                return False
        except requests.exceptions.ConnectionError:
            print("Connection attempt failed: the remote host did not respond in time")
            return False

    def get_index(self, url):
        """
        Visit the home page to establish the session.
        :param url: home page address used to open the session
        :return: True if the home page is reachable, otherwise False
        """
        response = self.get_status(url)
        if response:
            print("Connected to the home page...")
            return True
        else:
            print("ERROR: failed to reach the home page!")
            return False

    def parse_html(self, url):
        """
        Parse a listing page and extract image names and URLs.
        :param url: address of the current listing page
        :return: a dict mapping file names to image URLs
        """
        title_url = {}
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        columns = html.select("#loop-square .column-post")

        for column in columns:
            titles = column.select(".entry-title")
            title = titles[0].text if titles else None
            imgs = column.select(".entry-content.cf img")
            img_url = imgs[0].get("src") if imgs else None

            if not title:
                continue
            try:
                # os.path.basename raises TypeError when img_url is None
                title = os.path.basename(img_url)
                title_url[title] = img_url
            except TypeError:
                print("ERROR:", img_url)
        return title_url

    def get_last_page(self, url):
        """
        Find the number of the last page from the pagination bar.
        :param url: URL of the first page
        :return: the last page number as an int
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        # If the last link is "下一页" (next page), the page number precedes it
        if pages[-1].text == "下一页":
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    @staticmethod
    def next_page(last_page):
        """
        Yield the URL of every listing page.
        :param last_page: number of the last page
        :return: a generator of page URLs
        """
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    def download(self, path, url):
        """
        Download one image.
        :param path: destination file path
        :param url: image URL
        :return:
        """
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        # Establish the session via the home page
        url = "http://www.girl13.com"
        if not self.get_index(url):
            return None

        # Find the last page
        url = "http://www.girl13.com/page/1"
        last_page = self.get_last_page(url)
        if not last_page:
            return None

        path = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(path):
            os.mkdir(path)

        # Walk through every listing page
        urls = self.next_page(last_page)
        for url in urls:
            print(url)
            title_url = self.parse_html(url)
            if not title_url:
                continue
            thread_list = []
            for title in title_url:
                path = os.path.abspath(os.path.join(os.getcwd(), "image", title))
                img_url = title_url[title]

                # One thread per image on the current page
                t = threading.Thread(target=self.download, args=(path, img_url))
                thread_list.append(t)

            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()

            time.sleep(self.time)

    def main(self):
        # Run main_ in a daemon thread; join() waits for it to finish
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()


if __name__ == '__main__':
    girl = Girl13()
    girl.main()
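
The second version reports a failed request and moves on, but each URL is still tried only once. A further hardening, not part of the original post, is to let requests retry transient failures by itself; a minimal sketch using urllib3's Retry (the retry count, backoff_factor, and status list are assumptions, not values from the original code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session():
    # Build a session whose HTTP(S) requests are retried up to 3 times,
    # with exponential backoff, before get_status ever sees an exception
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

Replacing self.session = requests.session() in __init__ with self.session = make_session() would keep the rest of the class unchanged.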

