6.23

import re, requests, xlwt, urllib.request, os
from requests.exceptions import ConnectionError
from fake_useragent import UserAgent


# Fetch one proxy from the local proxy-pool service
def get_proxy():
    return requests.get('http://127.0.0.1:5010/get/').text


# Remove an unusable proxy from the pool
def delete_proxy(proxy):
    requests.get('http://127.0.0.1:5010/delete/?proxy={}'.format(proxy))
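
# The two helpers above assume a proxy-pool service (for example the open-source
# jhao104/proxy_pool project) is already running on 127.0.0.1:5010, where /get/
# is expected to return one proxy as plain text and /delete/ drops it from the
# pool. delete_proxy is never called in the script below; the sketch that follows
# is an illustrative, hypothetical helper (fetch_with_retry is not part of the
# original script) showing how it would typically be wired into a retry loop.
def fetch_with_retry(url, headers, retries=3):
    # Try up to `retries` different proxies, discarding any proxy that fails.
    for _ in range(retries):
        proxy = get_proxy()
        try:
            response = requests.get(url, headers=headers,
                                    proxies={'http': 'http://' + proxy}, timeout=10)
            if response.status_code == 200:
                return response.text
        except ConnectionError:
            delete_proxy(proxy)  # remove the dead proxy from the pool
    return None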


class QiShu(object):
    ua = UserAgent()

    def __init__(self):
        self.headers = {
            'Host': 'www.qisuu.la',
            'User-Agent': self.ua.random
        }
        self.row = 1

    # Fetch the HTML of a category listing page
    def get_list_page(self, url):
        proxy = get_proxy()
        # print('Using proxy {} to request {}'.format(proxy, url))
        proxies = {'http': 'http://' + proxy}
        try:
            response = requests.get(url, headers=self.headers, proxies=proxies)
            # print(response.status_code)
            if response.status_code == 200:
                # print('{} fetched successfully'.format(url))
                return response.text
            else:
                print('Unexpected response for {}'.format(url))
                return None
        except ConnectionError:
            print('Connection error while requesting {}'.format(url))
            return None

    # Extract each book's href from the listing page
    def parse_list_page(self, list_html):
        if list_html:
            hrefs = re.findall(re.compile(r'<div class="s">.*?<a href="(.*?)">', re.S), list_html)
            # next-page href: the link that follows the '上一页' (previous page) label
            next_match = re.search(re.compile(r"上一页.*?<a href='(.*?)'>", re.S), list_html)
            if next_match:
                hrefs.append(next_match.group(1))
            for href in hrefs:
                detail_url = 'https://www.qisuu.la' + href
                yield detail_url
        else:
            print('Listing page contained no data')

    # Fetch the HTML of a book's detail page
    def get_detail_page(self, detail_url):
        proxy = get_proxy()
        # print('Using proxy {} to request {}'.format(proxy, detail_url))
        proxies = {'http': 'http://' + proxy}
        try:
            response = requests.get(detail_url, headers=self.headers, proxies=proxies)
            # print(response.status_code)
            if response.status_code == 200:
                response.encoding = 'utf-8'
                # print('{} fetched successfully'.format(detail_url))
                return response.text
            else:
                print('Unexpected response for {}'.format(detail_url))
                return None
        except ConnectionError:
            print('Connection error while requesting {}'.format(detail_url))
            return None

    # Parse the book's details (cover image, metadata fields, intro, download link)
    def parse_detail_page(self, detail_html):
        if detail_html:
            img = re.findall(re.compile(r'<div class="detail">.*?<img src="(.*?)" onerror="(.*?)">', re.S), detail_html)[0]
            pattern = re.compile(
                r'<div class="detail">.*?<h1>(.*?)</h1>.*?<li class="small">(.*?)</li>.*?<li class="small">(.*?)</li>.*?<li class="small">(.*?)</li>.*?<li class="small">(.*?)</li>.*?<li class="small">(.*?)</li>.*?<li class="small">(.*?)</li>.*?<li class="small">.*?<a.*?>(.*?)</a>',
                re.S)
            fields = re.search(pattern, detail_html).groups()
            info = re.findall(
                re.compile(r'<div class="showBox mt20">.*?<h1(.*?).*?<div class="showInfo">.*?<p>(.*?)</p>', re.S),
                detail_html)[0]
            link = re.search(re.compile(r"get_down_url.*?,'(.*?)'", re.S), detail_html).groups()
            data = fields + info + link
            # print(data)
            self.save_img(img)
            return data

        else:
            print('Detail page contained no data')

    def save_img(self, href):
        os.chdir('奇书网图片')  # work inside the image folder created in __main__
        href1 = 'https://www.qisuu.la' + href[0]
        href2 = 'https://www.qisuu.la' + href[1].split("'")[1]
        name1 = href1.split('/')[-1]
        name2 = href2.split('/')[-1]
        if requests.get(href1).status_code == 404:
            urllib.request.urlretrieve(href2, name2)

        else:
            urllib.request.urlretrieve(href1, name1)
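
    # save_img above downloads the whole primary image once with requests just to
    # check for a 404, then downloads it again with urllib.request.urlretrieve.
    # The method below is an illustrative alternative sketch only
    # (save_img_single_request is hypothetical and not used elsewhere): it issues
    # one GET per candidate URL and writes the bytes directly.
    def save_img_single_request(self, href):
        for path in (href[0], href[1].split("'")[1]):
            url = 'https://www.qisuu.la' + path
            response = requests.get(url)
            if response.status_code == 200:
                with open(url.split('/')[-1], 'wb') as f:
                    f.write(response.content)
                return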

    def open_file(self):
        # 1. Create the workbook object
        book = xlwt.Workbook(encoding='utf-8')
        # 2. Create a worksheet
        sheet = book.add_sheet('奇书网')
        # 3. Write the header row
        # (the first argument is the row, the second is the column)
        sheet.write(0, 0, 'Title')
        sheet.write(0, 1, 'Clicks')
        sheet.write(0, 2, 'File size')
        sheet.write(0, 3, 'Category')
        sheet.write(0, 4, 'Update date')
        sheet.write(0, 5, 'Status')
        sheet.write(0, 6, 'Author')
        sheet.write(0, 7, 'Latest chapter')
        sheet.write(0, 8, 'Introduction')
        sheet.write(0, 9, 'Download link')

        return book, sheet

    def write_data(self, data, sheet):
        os.chdir(os.path.pardir)  # step back out of the image folder so the workbook lands in the original directory
        title = data[0]
        clicks = data[1].split(':')[-1]
        file_size = data[2].split(':')[-1]
        category = data[3].split(':')[-1]
        update_date = data[4].split(':')[-1]
        status = data[5].split(':')[-1]
        author = data[6].split(':')[-1]
        latest_chapter = data[7].split(':')[-1]
        intro = data[9]
        link = data[10]
        sheet.write(self.row, 0, title)
        sheet.write(self.row, 1, clicks)
        sheet.write(self.row, 2, file_size)
        sheet.write(self.row, 3, category)
        sheet.write(self.row, 4, update_date)
        sheet.write(self.row, 5, status)
        sheet.write(self.row, 6, author)
        sheet.write(self.row, 7, latest_chapter)
        sheet.write(self.row, 8, intro)
        sheet.write(self.row, 9, link)
        # the workbook itself is saved by close_file() right after each row (see main)
        self.row += 1

    def close_file(self, book):
        book.save('奇书网小说.xls')


def main(url, book, sheet):
    list_html = qishu.get_list_page(url)  # fetch the category listing HTML
    if list_html:
        list_data = qishu.parse_list_page(list_html)  # detail-page URLs for each book
        for detail_url in list_data:
            if 'index' not in detail_url:
                detail_html = qishu.get_detail_page(detail_url)  # fetch the book's detail page
                if detail_html:
                    data = qishu.parse_detail_page(detail_html)  # parse the book's details
                    if data:
                        qishu.write_data(data, sheet)  # write one row
                        qishu.close_file(book)
                        print('{} written'.format(data[0]))
            else:
                main(detail_url, book, sheet)  # 'index' URLs are pagination links: recurse into the next listing page


if __name__ == '__main__':
    qishu = QiShu()
    # ua = UserAgent()
    book, sheet = qishu.open_file()
    os.makedirs('奇书网图片', exist_ok=True)  # image folder; don't fail if it already exists
    for x in range(1, 11):
        url = 'https://www.qisuu.la/soft/sort0{}/'.format(x)  # category URL
        main(url, book, sheet)
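

# main() follows pagination by calling itself whenever a yielded URL contains
# 'index' (the next-page link), so a category with many pages keeps one Python
# stack frame per page. The sketch below is an illustrative, iterative version of
# the same crawl loop (crawl_category is hypothetical and not called above); it
# reuses the existing QiShu methods unchanged.
def crawl_category(start_url, book, sheet):
    pending = [start_url]
    while pending:
        page_url = pending.pop()
        list_html = qishu.get_list_page(page_url)
        if not list_html:
            continue
        for detail_url in qishu.parse_list_page(list_html):
            if 'index' in detail_url:
                pending.append(detail_url)  # queue the next listing page
                continue
            detail_html = qishu.get_detail_page(detail_url)
            if detail_html:
                data = qishu.parse_detail_page(detail_html)
                if data:
                    qishu.write_data(data, sheet)
                    qishu.close_file(book)
                    print('{} written'.format(data[0]))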


 
