Batch Downloading Files for Big Data

Lesson 1 code:

# Batch downloading files for big data
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request


def download_from_url(df):
    # Track the names of failed downloads, one list per link column.
    # (Initialized once, outside the loop, so failures accumulate
    # across the whole chunk instead of being reset every row.)
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            request.urlretrieve(url=url1, filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            request.urlretrieve(url=url2, filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            request.urlretrieve(url=url3, filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))

if __name__ == '__main__':
    df = pd.read_csv(r"C:\Users\Sway\Desktop\downloading.csv")
    os.chdir(r"C:\Users\Sway\Desktop\downloaded_image")

    # Split the DataFrame into equal chunks, one per worker process.
    n_chunks = 3
    df_array = np.array_split(df, n_chunks, axis=0)

    # Create one worker process per chunk.
    pp = [Process(target=download_from_url, args=(chunk,)) for chunk in df_array]

    for p in pp:
        p.start()

    for p in pp:
        p.join()
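
The script assumes downloading.csv has six columns named link1, link1_name, link2, link2_name, link3, link3_name, matching the keys read inside download_from_url. A minimal sketch for generating a test file (the URLs and file names below are hypothetical placeholders, not the lesson's data):

# Minimal sketch of the CSV layout download_from_url expects; the
# example URLs and names are hypothetical placeholders.
import pandas as pd

sample = pd.DataFrame({
    "link1": ["https://example.com/a1.jpg"],
    "link1_name": ["a1.jpg"],
    "link2": ["https://example.com/a2.jpg"],
    "link2_name": ["a2.jpg"],
    "link3": ["https://example.com/a3.jpg"],
    "link3_name": ["a3.jpg"],
})
sample.to_csv("downloading.csv", index=False)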


Lesson 2 code:

# Batch downloading files for big data (Lesson 2)
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request
import requests
from bs4 import BeautifulSoup
import re


def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        print(r.status_code)
        r.encoding = r.apparent_encoding

        # Save the page for inspection, then return its text.
        # Note: every worker process overwrites this same file.
        with open("try.txt", "w", encoding="utf-8") as file1:
            file1.write(r.text)
        return r.text
    except Exception:
        print("failed")
        return None


def download_from_url(df):
    # Track the names of failed downloads, one list per link column,
    # initialized once so failures accumulate across the whole chunk.
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            request.urlretrieve(url=url1, filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            request.urlretrieve(url=url2, filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            request.urlretrieve(url=url3, filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))


def download_from_url_dropbox(df):
    # Track the names of failed downloads, one list per link column,
    # initialized once so failures accumulate across the whole chunk.
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            # Fetch the Dropbox share page, find the preview <img> tag,
            # and take the last https URL found in its string form.
            html = getHTMLText(url=url1)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            html = getHTMLText(url=url2)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            html = getHTMLText(url=url3)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))


if __name__ == '__main__':
    df = pd.read_csv(r"C:\Users\username\Desktop\downloading.csv", engine="python")
    os.chdir(r"C:\Users\username\Desktop\downloaded_image")
    # Split the DataFrame into equal chunks, one per worker process.
    n_chunks = 3
    df_array = np.array_split(df, n_chunks, axis=0)

    # One worker per chunk; swap in download_from_url for plain direct links.
    # pp = [Process(target=download_from_url, args=(chunk,)) for chunk in df_array]
    pp = [Process(target=download_from_url_dropbox, args=(chunk,)) for chunk in df_array]

    for p in pp:
        p.start()

    for p in pp:
        p.join()
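
The Dropbox helper works by fetching the share page, locating the <img class="preview"> tag, and regex-matching the last https URL in the tag's string form; that direct URL is what gets passed to urlretrieve. Below is a minimal sketch of just the extraction step, run on hypothetical HTML (real Dropbox markup may differ). Note the character class here also excludes the double quote, since the serialized tag places the attribute's closing quote immediately after the URL:

# Minimal sketch of the preview-URL extraction, run on hypothetical
# HTML; real Dropbox pages may look different.
import re
from bs4 import BeautifulSoup

html = '<img class="preview" src="https://example.dropboxusercontent.com/p/thumb/photo.jpg"/>'
soup = BeautifulSoup(html, "html.parser")
imgs = soup.find_all("img", {"class": "preview"})
# Exclude quotes as well as whitespace so the closing " is not matched.
urls = re.findall(r'https?://[^\s"]+', str(imgs))
print(urls[-1])  # https://example.dropboxusercontent.com/p/thumb/photo.jpg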


Video links:
Lesson 1:
https://www.bilibili.com/video/BV1gV411m7FA
Lesson 2:
https://www.bilibili.com/video/BV1Gt4y1q7K5
