My First Little Crawler: Downloading Novels from 下书网 (xiashu.la), v1

My first little crawler. It still has quite a few problems!

import urllib.request
import re
import os
import time
import random

path = os.getcwd()  # current working directory; downloaded books are saved here

def open_url(url):
    # Send a browser-like User-Agent so the site does not reject the request.
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36')
    page = urllib.request.urlopen(req)  # open the Request object, not the bare URL, or the header is ignored
    html = page.read().decode('utf-8')
    return html

def get_txt(html, bookname):
    # The chapter title lives in <title>; the body sits between
    # <div id="tac"> and <div class="info bottominfo">.
    lst1 = r'<head>[\s\S]*<title>.*</title>'
    lst2 = r'<div id="tac">[\s\S]*<div class="info bottominfo">'
    l1 = str(re.findall(lst1, html))
    l1 = l1[51:].replace('\'', '').replace('"', '').replace('>]', '')
    l1list = l1.split(',')[0]
    l2 = str(re.findall(lst2, html))
    # Full-width indents become spaces, <br/><br/> becomes a paragraph break,
    # and the trailing markup is trimmed off.
    l2 = l2[92:].replace(r'\u3000\u3000', '      ').replace('<br/><br/>', '\n')[:-60]
    l2 = re.sub(r'\*', ' ', l2)
    with open(os.path.join(path, '%s.txt' % bookname), 'a', encoding='utf-8') as f:
        f.write(l1list)
        f.write('\n\n')
        f.write(l2)
        f.write('\n\n\n')
    print(l1list + ' →→→ download complete →→→')

def get_titlename(html, bookname):
    # The index page's <title> carries the book name and author.
    lst3 = r'<head>[\s\S]*<title>.*</title>'
    l3 = str(re.findall(lst3, html))
    l3 = l3[43:].split('_')[0].replace('txt下载', '\n  ——').replace('(', '').replace(')', '')
    print(l3 + ' ... downloading')
    with open(os.path.join(path, '%s.txt' % bookname), 'a', encoding='utf-8') as f:
        f.write(l3)
        f.write('\n\n')
    print(l3 + ' →→→ title written →→→')

def get_txtname(html):
    # Derive the output file name from the index page's <title>:
    # everything before 'txt' in the title string.
    lst4 = r'<head>[\s\S]*<title>.*</title>'
    l4 = str(re.findall(lst4, html))
    l5 = l4[43:].split('txt')[0]
    # Create (or touch) the output file and close it properly.
    open(os.path.join(path, '%s.txt' % l5), 'a', encoding='utf-8').close()
    return l5

if __name__ == '__main__':
    print('\nUsage: for《武道乾坤》, whose URL is https://www.xiashu.la/2186/ ,'
          ' the book ID is 2186')
    url0 = 'https://www.xiashu.la'
    ml = input('Enter the book ID: ')
    url1 = url0 + '/' + ml + '/'
    print('Index URL: %s' % url1)
    chapters = int(input('Enter the total number of chapters (e.g. for 80 chapters, enter 80): '))
    print('Current working directory: %s' % path)
    # Fetch the index page once, derive the output file name, then write the book header.
    index_html = open_url(url1)
    bookname = get_txtname(index_html)
    get_titlename(index_html, bookname)
    # Chapter pages follow the pattern <index URL>read_<n>.html.
    for chapter in range(1, chapters + 1):
        url = url1 + 'read_' + str(chapter) + '.html'
        t = random.randint(1, 5)
        print(t)
        time.sleep(t)  # pause 1-5 seconds between requests to go easy on the server
        get_txt(open_url(url), bookname)
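
A note on robustness: the extraction above leans on str(re.findall(...)) plus fixed slice offsets such as l1[51:] and l2[92:], which breaks the moment the surrounding markup shifts by even one character. Below is a minimal sketch of a sturdier variant, assuming the pages keep the layout the patterns above imply (chapter name inside <title>, body between <div id="tac"> and <div class="info bottominfo">); extract_chapter is a hypothetical helper, not part of the original script:

import re

def extract_chapter(html):
    # Pull the chapter name and body out with capture groups,
    # so no magic slice offsets are needed.
    title_m = re.search(r'<title>(.*?)</title>', html)
    body_m = re.search(r'<div id="tac">([\s\S]*?)<div class="info bottominfo">', html)
    if not (title_m and body_m):
        return None, None  # layout changed or the request failed
    title = title_m.group(1)
    # Full-width indents become plain spaces; <br/><br/> becomes a paragraph break.
    body = body_m.group(1).replace('\u3000\u3000', '  ').replace('<br/><br/>', '\n')
    return title, body

Unlike the offset-based version, a failed match returns None instead of silently writing garbage, and the capture groups keep working when the length of the <head> or of the trailing markup changes.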

Reposted from: https://www.cnblogs.com/lasttime/p/10717619.html
