Batch Downloading Files for Big Data

Lesson 1 code:

# Batch downloading files for big data
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request


def download_from_url(df):
    # Track the names of failed downloads, one list per link column.
    # (Initialized once, outside the loop, so failures accumulate
    # across the whole chunk instead of being reset every row.)
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            request.urlretrieve(url=url1, filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            request.urlretrieve(url=url2, filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            request.urlretrieve(url=url3, filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))

if __name__ == '__main__':
    df = pd.read_csv(r"C:\Users\Sway\Desktop\downloading.csv")
    os.chdir(r"C:\Users\Sway\Desktop\downloaded_image")

    # Split the DataFrame into equal chunks, one per worker process.
    n_chunks = 3
    df_array = np.array_split(df, n_chunks, axis=0)

    # Create one worker process per chunk.
    pp = [Process(target=download_from_url, args=(chunk,)) for chunk in df_array]

    for p in pp:
        p.start()

    for p in pp:
        p.join()
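
The script assumes downloading.csv has six columns named link1, link1_name, link2, link2_name, link3, link3_name, matching the keys read inside download_from_url. A minimal sketch for generating a test file (the URLs and file names below are hypothetical placeholders, not the lesson's data):

# Minimal sketch of the CSV layout download_from_url expects; the
# example URLs and names are hypothetical placeholders.
import pandas as pd

sample = pd.DataFrame({
    "link1": ["https://example.com/a1.jpg"],
    "link1_name": ["a1.jpg"],
    "link2": ["https://example.com/a2.jpg"],
    "link2_name": ["a2.jpg"],
    "link3": ["https://example.com/a3.jpg"],
    "link3_name": ["a3.jpg"],
})
sample.to_csv("downloading.csv", index=False)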


Lesson 2 code:

# Batch downloading files for big data (Lesson 2)
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request
import requests
from bs4 import BeautifulSoup
import re


def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        print(r.status_code)
        r.encoding = r.apparent_encoding

        # Save the page for inspection, then return its text.
        # Note: every worker process overwrites this same file.
        with open("try.txt", "w", encoding="utf-8") as file1:
            file1.write(r.text)
        return r.text
    except Exception:
        print("failed")
        return None


def download_from_url(df):
    # Track the names of failed downloads, one list per link column,
    # initialized once so failures accumulate across the whole chunk.
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            request.urlretrieve(url=url1, filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            request.urlretrieve(url=url2, filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            request.urlretrieve(url=url3, filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))


def download_from_url_dropbox(df):
    # Track the names of failed downloads, one list per link column,
    # initialized once so failures accumulate across the whole chunk.
    list_name1 = []
    list_name2 = []
    list_name3 = []

    for index, row in df.iterrows():
        url1 = row["link1"]
        name1 = row["link1_name"]
        url2 = row["link2"]
        name2 = row["link2_name"]
        url3 = row["link3"]
        name3 = row["link3_name"]

        try:
            # Fetch the Dropbox share page, find the preview <img> tag,
            # and take the last https URL found in its string form.
            html = getHTMLText(url=url1)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name1)
        except Exception:
            # Record the failure and move on to the next file.
            list_name1.append(name1)
            print(name1)
        try:
            html = getHTMLText(url=url2)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name2)
        except Exception:
            list_name2.append(name2)
            print(name2)
        try:
            html = getHTMLText(url=url3)
            soup = BeautifulSoup(html, "html.parser")
            mydivs = soup.find_all("img", {"class": "preview"})
            a = re.findall(r'(https?://[^\s]+)', str(mydivs))
            request.urlretrieve(url=a[-1], filename=name3)
        except Exception:
            list_name3.append(name3)
            print(name3)

    # Print the unique failed names once the whole chunk is done.
    print(set(list_name1))
    print(set(list_name2))
    print(set(list_name3))


if __name__ == '__main__':
    df = pd.read_csv(r"C:\Users\username\Desktop\downloading.csv", engine="python")
    os.chdir(r"C:\Users\username\Desktop\downloaded_image")
    # Split the DataFrame into equal chunks, one per worker process.
    n_chunks = 3
    df_array = np.array_split(df, n_chunks, axis=0)

    # One worker per chunk; swap in download_from_url for plain direct links.
    # pp = [Process(target=download_from_url, args=(chunk,)) for chunk in df_array]
    pp = [Process(target=download_from_url_dropbox, args=(chunk,)) for chunk in df_array]

    for p in pp:
        p.start()

    for p in pp:
        p.join()
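
The Dropbox helper works by fetching the share page, locating the <img class="preview"> tag, and regex-matching the last https URL in the tag's string form; that direct URL is what gets passed to urlretrieve. Below is a minimal sketch of just the extraction step, run on hypothetical HTML (real Dropbox markup may differ). Note the character class here also excludes the double quote, since the serialized tag places the attribute's closing quote immediately after the URL:

# Minimal sketch of the preview-URL extraction, run on hypothetical
# HTML; real Dropbox pages may look different.
import re
from bs4 import BeautifulSoup

html = '<img class="preview" src="https://example.dropboxusercontent.com/p/thumb/photo.jpg"/>'
soup = BeautifulSoup(html, "html.parser")
imgs = soup.find_all("img", {"class": "preview"})
# Exclude quotes as well as whitespace so the closing " is not matched.
urls = re.findall(r'https?://[^\s"]+', str(imgs))
print(urls[-1])  # https://example.dropboxusercontent.com/p/thumb/photo.jpg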


Video links:
Lesson 1:
https://www.bilibili.com/video/BV1gV411m7FA
Lesson 2:
https://www.bilibili.com/video/BV1Gt4y1q7K5
