Note: this crawler only fetches pages that are already publicly reachable (anything the Baidu spider can crawl, for example), so it does not constitute any kind of cracking or circumvention.
A packaged single-threaded Python 3 crawler for batch-downloading files:
1. sleep() delays are friendly to the target server and will not saturate your own downstream bandwidth;
2. a single thread makes it easier to control file-download breakpoints (a Range-request resume sketch follows this list);
3. the synchronous flow downloads the whole batch of files one after another.
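Point 2's "breakpoints" can be pushed further with HTTP Range requests: if a partial file is already on disk, only the remaining bytes are requested. A minimal sketch, assuming the target server honours Range headers (the function name resume_download is mine, not part of the script below):

import os
import requests

def resume_download(file_url, path):
    # resume from however many bytes are already on disk
    start = os.path.getsize(path) if os.path.exists(path) else 0
    r = requests.get(file_url, headers={'Range': 'bytes=%d-' % start},
                     stream=True, timeout=30)
    if r.status_code in (200, 206):                    # 206 = Partial Content
        mode = 'ab' if r.status_code == 206 else 'wb'  # append, or rewrite if Range was ignored
        with open(path, mode) as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)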
The full code is below:
(A small number of files may come down empty or throw an error during a run; likely causes are a broken or mis-encoded file on the remote server, or the local disk filling up. Neither aborts the crawl: such files are simply skipped.)
import requests  # pip install requests
from bs4 import BeautifulSoup
import urllib.request
import os
import sys
import time
import random
from urllib import parse
# Append a block of text to a txt file
def write_txt(filename, info):
    with open(filename, 'a', encoding='utf-8') as txt:
        txt.write(info + '\n\n')
# Simple GET request, returns the response body as text
def request_get(get_url=''):
    get_response = requests.get(get_url)
    res = get_response.text
    return res
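# The note above mentions skipping files that error out; retrying the GET a
# few times first cuts down on transient failures. A sketch
# (request_get_retry is my name, not part of the original code):
def request_get_retry(get_url='', tries=3, delay=1.0):
    for attempt in range(tries):
        try:
            resp = requests.get(get_url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except Exception:
            if attempt == tries - 1:
                return None  # give up and let the caller skip this url
            time.sleep(delay)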
# Simple POST request with form-encoded data
def request_post(post_url='', data_dict=None):
    if data_dict is None:
        data_dict = {'test': 'my test-post data', 'create_time': '2019'}  # sample form data, dict format
    res = requests.post(url=post_url, data=data_dict,
                        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    return res
# GET request through an HTTP proxy (urllib version)
def use_proxy_request_api(url, proxy_addr='122.241.72.191:808'):
    req = urllib.request.Request(url)
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    res = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
    return res
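# The same proxied GET can also be done with requests alone; a minimal
# sketch, assuming the sample proxy address above is still alive
# (use_proxy_request_api2 is my name, not part of the original code):
def use_proxy_request_api2(url, proxy_addr='122.241.72.191:808'):
    proxies = {'http': 'http://' + proxy_addr}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    res = requests.get(url, proxies=proxies, headers=headers, timeout=10)
    return res.text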
# ############################################################################
# Extract a query-string parameter from a url
def get_url_param(url='', key=''):
    array = parse.parse_qs(parse.urlparse(url).query)
    return array[key]
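# Note that parse_qs maps each key to a *list* of values, so this helper
# returns a list even for a single-valued parameter, e.g.:
#   get_url_param('https://www.wendu.com/?page=3', 'page')  ->  ['3']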
# Fetch a url and return it as a parsed BeautifulSoup document
def get_url_html(url, state=0):
    if state == 0:
        url = domain + url  # relative path: prepend the main site domain
    # Pick a random User-Agent so successive requests look less uniform
    header_list = [
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36 Edg/80.0.361.9'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'},
        {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.3 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1 wechatdevtools/1.02.1910120 MicroMessenger/7.0.4 Language/zh_CN webview/15780410115046065 webdebugger port/41084'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'},
    ]
    headers = random.choice(header_list)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        response = urllib.request.urlopen(req)
    except Exception:
        return None  # network/HTTP error: let the caller decide whether to skip or abort
    raw = response.read()  # read the body once; a second read() would return empty bytes
    try:
        page = raw.decode('utf-8')  # encodings seen in practice: utf-8, gb2312, GBK
    except UnicodeDecodeError:
        page = raw.decode('gb2312', 'ignore')
    html_string_page = str(page)  # plain string, can be written straight to a database
    soup_page = BeautifulSoup(html_string_page, "html.parser")  # parse the html tags
    return soup_page
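# If the encodings vary more than utf-8/gb2312, the charset can be detected
# instead of guessed. A sketch using chardet (pip install chardet;
# decode_html is my helper name, not part of the original code):
def decode_html(raw_bytes):
    import chardet  # imported here so the main script does not require it
    guess = chardet.detect(raw_bytes)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    return raw_bytes.decode(guess['encoding'] or 'utf-8', 'ignore')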
# Scrape one article page and download its attached file
def get_html3(url, class_name, title):
    soup_page = get_url_html(url, 1)
    print(url)
    if soup_page is None:
        print('break')
        sys.exit()
    # print(soup_page)
    a = soup_page.find('a', attrs={'class': 'article-download'})
    file_href = a.get('href')
    file_info = file_href.split(".")
    try:
        file_info_path = file_info[-2].split("/")
        # print(a)
        print(title)
        print(file_href)
        print(file_info_path[-1])
        print(file_info[-1])
        root = "D:/python38/demo/wendu_file/"  # the last directory level is created automatically if missing
        path = root + class_name + '_' + title + '_' + file_info_path[-1] + '.' + file_info[-1]  # absolute file path
        # Save the file
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(file_href)
                r.raise_for_status()
                # the with statement closes the file handle automatically
                with open(path, "wb") as f:  # "wb" = write binary
                    f.write(r.content)
                print("file saved successfully")
            else:
                print("file already exists")
        except Exception as e:
            print("failed to save file: " + str(e))
    except Exception as e:
        print("dead file link: " + str(e))
    # Save the article via the api (optional, disabled)
    # post_url = 'app/save_article'
    # post_data = {
    #     'course_class_id': pre_class_id,  # matching class id, set by hand
    #     'class_name': class_name,
    #     'title': title,
    #     'url': url,
    #     'content': content,
    #     'description': description,
    #     'div_content': div_content,
    # }
    # # print(post_data)
    # res = request_post(api + post_url, post_data)
    # print(res)
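# r.content above loads the whole file into memory. For very large files a
# streamed download is safer; a sketch (save_file_streamed is my name, not
# part of the original script):
def save_file_streamed(file_url, path):
    with requests.get(file_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)  # write the file piece by piece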
# Scrape one listing (catalogue) page and walk its article entries
def get_html2(url):
    soup_page = get_url_html(url, 1)
    print(url)
    if soup_page is None:
        sys.exit()
    that_div = soup_page.find('div', attrs={'class': 'date-word'}).find_all('div', attrs={'class': 'date-load'})
    that_h = soup_page.find_all('a', attrs={'class': 'current'})[1].get_text()
    if not that_h:
        that_h = '全部'  # '全部' = "all categories"
    # print(that_div)
    # print(that_h)
    for j in range(0, len(that_div) - 1):  # skips the last date-load div; use range(len(that_div)) to include it
        print('===' + str(j) + '===')
        div = that_div[j]
        a = div.find('div', attrs={'class': 'date-load-fl'}).find('a')
        a_txt = a.get_text()
        a_href = a.get('href')
        # print(a)
        # print([a_txt, a_href, that_h])
        get_html3(a_href, pre_class_name + that_h, a_txt)
        time.sleep(0.8)
# --- parameters ---
api = 'http://192.168.131.129/pydata/public/index.php/api/'
domain = 'http://xxxxx'  # main site url
pre_class_id = 9
pre_class_name = '医考_'  # category prefix for saved file names ("medical exam")
if __name__ == '__main__':  # script entry point
    print('--- start ---')
    # kick off the crawl here
    url = 'https://www.wendu.com/index.php?m=content&c=index&a=lists&catid=302&siteid=1&page='
    for a in range(1, 50):
        _url = url + str(a)
        # print(_url)
        get_html2(_url)
        time.sleep(2)
    print('--- done ---')
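A last small refinement toward point 1 (being friendly to the target server): a randomized pause is less predictable than the fixed time.sleep(2) used above. A one-line sketch:

    time.sleep(random.uniform(1.5, 3.0))  # random 1.5-3.0s pause between listing pages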