【Python 3 application】Single-threaded download of files in multiple formats

-

Note: this crawler only fetches page content that is already reachable by ordinary search-engine spiders (e.g. the Baidu crawler), so it is not intended as any kind of circumvention or cracking.

-

A packaged single-threaded Python 3 crawler for downloading files:

1. sleep delays are friendly to the remote server and keep the download bandwidth from being saturated;

2. a single thread makes it easier to control where a download run left off;

3. synchronous execution handles the batch download of files (a minimal sketch of this loop follows).
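
The whole idea boils down to a sequential loop with a delay between requests and a skip on failure. Below is a minimal sketch of that pattern, assuming a hypothetical list of file URLs and output directory (placeholders, not the actual site scraped later):

import os
import time

import requests

# Placeholder inputs for illustration only
file_urls = [
    'https://example.com/files/a.pdf',
    'https://example.com/files/b.docx',
]
out_dir = 'downloads'
os.makedirs(out_dir, exist_ok=True)

for file_url in file_urls:
    path = os.path.join(out_dir, file_url.rsplit('/', 1)[-1])
    if os.path.exists(path):  # simple "breakpoint": skip files already on disk
        continue
    try:
        r = requests.get(file_url, timeout=30)
        r.raise_for_status()
        with open(path, 'wb') as f:  # write the response body as binary
            f.write(r.content)
    except Exception as e:  # skip failures so the batch keeps going
        print('skip', file_url, e)
    time.sleep(1)  # be friendly to the remote server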

-

The full code is listed below:

(During a run a small number of files may come down empty or fail with an error, probably because the file on the remote server is mis-encoded or the local machine runs out of space; these cases are simply skipped so the crawl itself does not abort.)

import requests  # pip install requests
from bs4 import BeautifulSoup
import urllib.request
import os
import sys
import time
from urllib import parse
import random


# Append a line of text to a txt file
def write_txt(filename, info):
    with open(filename, 'a', encoding='utf-8') as txt:
        txt.write(info + '\n\n')


# GET request
def request_get(get_url=''):
    get_response = requests.get(get_url)
    res = get_response.text
    return res


# POST request
def request_post(post_url='', data_dict=None):
    if data_dict is None:
        data_dict = {'test': 'my test-post data', 'create_time': '2019'}  # example data in dict form
    res = requests.post(url=post_url, data=data_dict, headers={'Content-Type': 'application/x-www-form-urlencoded'})
    return res


# Call an API with a GET request through an HTTP proxy
def use_proxy_request_api(url, proxy_addr='122.241.72.191:808'):
    req = urllib.request.Request(url)
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    res = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
    return res


# ############################################################################

# Get a query-string parameter from a URL
def get_url_param(url='', key=''):
    array = parse.parse_qs(parse.urlparse(url).query)
    return array[key]


# Fetch a URL and return it as a parsed BeautifulSoup object
def get_url_html(url, state=0):
    if state == 0:
        url = domain + url  # relative path: prepend the site domain
    # Pick a random User-Agent so the requests look less uniform
    header_list = [
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36 Edg/80.0.361.9'},
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'},
        {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.3 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1 wechatdevtools/1.02.1910120 MicroMessenger/7.0.4 Language/zh_CN webview/15780410115046065 webdebugger port/41084'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'},
    ]
    headers = random.choice(header_list)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        response = urllib.request.urlopen(req)
    except Exception:
        return None

    # Read the body once, then try the likely encodings (utf-8 first, then gb2312/GBK)
    raw = response.read()
    try:
        page = raw.decode('utf-8')
    except UnicodeDecodeError:
        page = raw.decode('gb2312', 'ignore')

    html_string_page = str(page)  # keep a plain string copy (e.g. for inserting into a database)
    soup_page = BeautifulSoup(html_string_page, "html.parser")  # parse the page markup

    return soup_page


# Scrape an article page and download the attached file
def get_html3(url, class_name, title):
    soup_page = get_url_html(url, 1)

    print(url)
    if soup_page is None:
        print('break')
        sys.exit()

    # The download link sits in <a class="article-download">
    a = soup_page.find('a', attrs={'class': 'article-download'})

    file_href = a.get('href')
    file_info = file_href.split(".")
    try:
        file_info_path = file_info[-2].split("/")

        print(title)
        print(file_href)
        print(file_info_path[-1])  # file name without extension
        print(file_info[-1])  # file extension

        root = "D:/python38/demo/wendu_file/"  # the last-level directory is created automatically if missing

        path = root + class_name + '_' + title + '_' + file_info_path[-1] + '.' + file_info[-1]  # absolute path of the file

        # Save the file
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(file_href)
                r.raise_for_status()
                # "with" closes the file handle automatically; "wb" writes binary
                with open(path, "wb") as f:
                    f.write(r.content)

                print("File saved")
            else:
                print("File already exists")
        except Exception as e:
            print("Failed to save file: " + str(e))

    except Exception as e:
        print("Broken file link: " + str(e))

    # Save the article record via the API (optional)
    # post_url = 'app/save_article'
    # post_data = {
    #     'course_class_id': pre_class_id,  # matching class id, set manually
    #     'class_name': class_name,
    #     'title': title,
    #     'url': url,
    #     'content': content,
    #     'description': description,
    #     'div_content': div_content,
    # }
    # # print(post_data)
    # res = request_post(api + post_url, post_data)
    # print(res)


# Scrape a listing (catalog) page
def get_html2(url):
    soup_page = get_url_html(url, 1)

    print(url)

    if soup_page is None:
        sys.exit()

    that_div = soup_page.find('div', attrs={'class': 'date-word'}).find_all('div', attrs={'class': 'date-load'})
    that_h = soup_page.find_all('a', attrs={'class': 'current'})[1].get_text()

    if that_h is None:
        that_h = '全部'

    for j in range(0, len(that_div) - 1):  # iterate the entries (the last one is skipped)
        print('===' + str(j) + '===')

        div = that_div[j]
        a = div.find('div', attrs={'class': 'date-load-fl'}).find('a')
        a_txt = a.get_text()
        a_href = a.get('href')

        # Follow each entry and download its file
        get_html3(a_href, pre_class_name + that_h, a_txt)

        time.sleep(0.8)


# Parameters
api = 'http://192.168.131.129/pydata/public/index.php/api/'
domain = 'http://xxxxx'  # main site URL
pre_class_id = 9
pre_class_name = '医考_'
if __name__ == '__main__':  # entry point
    print('---start---')
    # start the crawl here

    url = 'https://www.wendu.com/index.php?m=content&c=index&a=lists&catid=302&siteid=1&page='

    for a in range(1, 50):
        _url = url + str(a)
        # print(_url)

        get_html2(_url)

        time.sleep(2)

    print('---done---')
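
In the code above a "breakpoint" simply means skipping files that already exist on disk. If a partially downloaded file ever needs to be resumed rather than redownloaded, an HTTP Range request can fetch only the missing bytes. A rough sketch of that idea, assuming the server supports Range requests (the URL and path below are placeholders):

import os
import requests

def resume_download(file_url, path):
    # Resume (or start) a download by requesting only the bytes not yet on disk
    done = os.path.getsize(path) if os.path.exists(path) else 0
    headers = {'Range': 'bytes=%d-' % done} if done else {}
    r = requests.get(file_url, headers=headers, stream=True, timeout=30)
    if r.status_code == 416:  # requested range not satisfiable: file is already complete
        return
    r.raise_for_status()
    mode = 'ab' if r.status_code == 206 else 'wb'  # 206 = server honored the Range header
    with open(path, mode) as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

# resume_download('https://example.com/files/a.pdf', 'downloads/a.pdf')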
