大数据批量下载文件 代码
# # 大数据批量下载文件
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request
def download_from_url(df):
    """Download the files referenced by the three link columns of *df*.

    For each row, reads the URL columns ``link1``/``link2``/``link3`` and the
    matching target-filename columns ``link1_name``/``link2_name``/``link3_name``
    (filenames are relative to the current working directory) and fetches each
    URL with ``urllib.request.urlretrieve``.

    Failed downloads are logged and skipped instead of being silently
    swallowed, and a failed file is no longer counted as downloaded.

    :param df: pandas DataFrame with the six columns named above.
    :return: dict mapping each link column name to the set of filenames that
             downloaded successfully. (The original returned ``None``, which
             callers — Process targets — ignored, so this is backward-compatible.)
    """
    downloaded = {"link1": set(), "link2": set(), "link3": set()}
    for _, row in df.iterrows():
        # The three link columns share a naming scheme, so one inner loop
        # replaces the original copy-pasted try/except blocks (which also
        # appended every name to list_name1, leaving lists 2 and 3 empty).
        for col in ("link1", "link2", "link3"):
            url = row[col]
            filename = row[col + "_name"]
            try:
                request.urlretrieve(url=url, filename=filename)
            except Exception as exc:
                # Best-effort batch download: report and continue with the rest.
                print(f"failed to download {url!r}: {exc}")
                continue
            downloaded[col].add(filename)
            print(filename)
    # Summary of what actually landed on disk, per link column.
    for col, names in downloaded.items():
        print(names)
    return downloaded
if __name__ == '__main__':
    # Work list: CSV with columns link1..link3 and link1_name..link3_name.
    df = pd.read_csv(r"C:\Users\Sway\Desktop\downloading.csv")
    # Download into this directory (the target filenames are relative paths).
    os.chdir(r"C:\Users\Sway\Desktop\downloaded_image")
    num_workers = 3  # was misspelled 'brock'
    # Split the rows into one roughly-equal chunk per worker process.
    chunks = np.array_split(df, num_workers, axis=0)
    # Build the worker list directly instead of pre-filling a range and
    # overwriting it by index.
    processes = [Process(target=download_from_url, args=(chunk,))
                 for chunk in chunks]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
第二课 代码:
# # 大数据批量下载文件
from multiprocessing import Process
import os
import pandas as pd
import numpy as np
from urllib import request
import requests
from bs4 import BeautifulSoup
import re
def getHTMLText(url):
    """Fetch *url* and return the decoded page text.

    Also dumps the raw text to ``try.txt`` in the current directory as a
    debugging aid (behavior kept from the original).

    :param url: page URL to fetch.
    :return: the page text on success, ``None`` on failure (the original's
             ``return print("failed")`` also evaluated to ``None``).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        print(r.status_code)
        # Let requests guess the real encoding from the body, not the header.
        r.encoding = r.apparent_encoding
        # 'with' guarantees the debug dump is closed even if the write raises
        # (the original leaked the handle on that path).
        with open("try.txt", "w", encoding="utf-8") as dump:
            dump.write(r.text)
        return r.text
    except (requests.RequestException, OSError) as exc:
        # Narrowed from a bare 'except:'; report why the fetch failed.
        print(f"failed: {exc}")
        return None
def download_from_url(df):
    """Download the files referenced by the three link columns of *df*.

    For each row, reads the URL columns ``link1``/``link2``/``link3`` and the
    matching target-filename columns ``link1_name``/``link2_name``/``link3_name``
    (filenames are relative to the current working directory) and fetches each
    URL with ``urllib.request.urlretrieve``.

    Failed downloads are logged and skipped instead of being silently
    swallowed, and a failed file is no longer counted as downloaded.

    :param df: pandas DataFrame with the six columns named above.
    :return: dict mapping each link column name to the set of filenames that
             downloaded successfully. (The original returned ``None``, which
             callers — Process targets — ignored, so this is backward-compatible.)
    """
    downloaded = {"link1": set(), "link2": set(), "link3": set()}
    for _, row in df.iterrows():
        # The three link columns share a naming scheme, so one inner loop
        # replaces the original copy-pasted try/except blocks (which also
        # appended every name to list_name1, leaving lists 2 and 3 empty).
        for col in ("link1", "link2", "link3"):
            url = row[col]
            filename = row[col + "_name"]
            try:
                request.urlretrieve(url=url, filename=filename)
            except Exception as exc:
                # Best-effort batch download: report and continue with the rest.
                print(f"failed to download {url!r}: {exc}")
                continue
            downloaded[col].add(filename)
            print(filename)
    # Summary of what actually landed on disk, per link column.
    for col, names in downloaded.items():
        print(names)
    return downloaded
def _extract_preview_image_url(page_url):
    """Return the last http(s) URL found inside ``<img class="preview">`` tags
    of the HTML at *page_url*.

    Raises whatever the fetch/parse raises (including ``TypeError`` when
    ``getHTMLText`` returned ``None``, or ``IndexError`` when no URL matched);
    the caller treats any exception as "this link failed".
    """
    html = getHTMLText(url=page_url)
    soup = BeautifulSoup(html, "html.parser")
    previews = soup.findAll("img", {"class": "preview"})
    candidates = re.findall(r'(https?://[^\s]+)', str(previews))
    # The last match is the direct image link on the preview page.
    return candidates[-1]


def download_from_url_dropbox(df):
    """Download images hidden behind share/preview pages (e.g. Dropbox links).

    For each row of *df* and each of the columns ``link1``/``link2``/``link3``,
    fetches the share page, scrapes the direct image URL from its
    ``<img class="preview">`` tag, and downloads it to the filename in the
    matching ``*_name`` column (relative to the current working directory).

    Failed links are logged and skipped instead of being silently swallowed,
    and a failed file is no longer counted as downloaded.

    :param df: pandas DataFrame with link1..link3 and link1_name..link3_name.
    :return: dict mapping each link column name to the set of filenames that
             downloaded successfully (original callers ignored the ``None``
             return, so this is backward-compatible).
    """
    downloaded = {"link1": set(), "link2": set(), "link3": set()}
    for _, row in df.iterrows():
        # One loop over the shared column-naming scheme replaces the original
        # three copy-pasted scrape-and-download blocks (which also appended
        # every name to list_name1, leaving lists 2 and 3 empty).
        for col in ("link1", "link2", "link3"):
            url = row[col]
            filename = row[col + "_name"]
            try:
                direct_url = _extract_preview_image_url(url)
                request.urlretrieve(url=direct_url, filename=filename)
            except Exception as exc:
                # Best-effort batch download: report and continue with the rest.
                print(f"failed to download {url!r}: {exc}")
                continue
            downloaded[col].add(filename)
            print(filename)
    # Summary of what actually landed on disk, per link column.
    for col, names in downloaded.items():
        print(names)
    return downloaded
if __name__ == '__main__':
    # Work list: CSV with columns link1..link3 and link1_name..link3_name.
    df = pd.read_csv(r"C:\Users\username\Desktop\downloading.csv", engine="python")
    # Download into this directory (the target filenames are relative paths).
    os.chdir(r"C:\Users\username\Desktop\downloaded_image")
    num_workers = 3  # was misspelled 'brock'
    # Split the rows into one roughly-equal chunk per worker process.
    chunks = np.array_split(df, num_workers, axis=0)
    # Build the worker list directly instead of pre-filling a range and
    # overwriting it by index. Swap the target to download_from_url for
    # direct (non-preview-page) links.
    processes = [Process(target=download_from_url_dropbox, args=(chunk,))
                 for chunk in chunks]
    for p in processes:
        p.start()
    for p in processes:
        p.join()
视频链接:
第一课:
https://www.bilibili.com/video/BV1gV411m7FA
第二课:
https://www.bilibili.com/video/BV1Gt4y1q7K5