下载目录
文章代码仅供学术讨论。
import requests
import urllib.request # url request
import os # dirs
import time
from bs4 import BeautifulSoup # 用于解析网页
import requests
import bs4
import tqdm
from retry import retry
import datetime
import sys
@retry(requests.exceptions.ConnectionError, delay=2)
def download_pypi_resource(path="C:\\Users\\82172\\PycharmProjects\\flaskTest\\common_test\\pypi.txt"):
    """Fetch the Tsinghua PyPI simple index and write every package name to *path*.

    path: output text file, one package name per line
          (default kept for backward compatibility with the original script).
    Retries on connection errors via the @retry decorator (2 s delay).
    """
    text = requests.get("https://pypi.tuna.tsinghua.edu.cn/simple").text
    bs = bs4.BeautifulSoup(text, "lxml")
    anchors = bs.find_all("a")
    # Explicit utf-8: package names may contain non-ASCII characters, which
    # would crash under the Windows default codec.
    with open(path, "w", encoding="utf-8") as f:
        # Iterate the anchors directly instead of range(len(...)).
        for anchor in tqdm.tqdm(anchors, total=len(anchors), desc="进度"):
            f.write(str(anchor.text) + "\n")
def _write_resource_file(path, file_name, resources):
    """Write one chunk of package names (one per line) to path + file_name and report the count."""
    with open("{}{}".format(path, file_name), "w", encoding="utf-8") as f:
        for item in resources:
            f.write(item + "\n")
    print("\n")
    print("{}文件中有资源{}个".format(file_name, len(resources)))


def batch_download_pypi_resource(num=30000, path="C:\\dir_name\\"):
    """Fetch the Tsinghua PyPI simple index and split the package names into
    numbered files (1.txt, 2.txt, ...) of *num* names each, so several crawler
    processes can later work on different files in parallel.

    num:  names per file (default 30000, as in the original script)
    path: existing directory (with trailing separator) to write the files into
    """
    name = 1  # next file number
    text = requests.get("https://pypi.tuna.tsinghua.edu.cn/simple").text
    bs = bs4.BeautifulSoup(text, "lxml")
    context = bs.find_all("a")
    resource_list = []
    for anchor in tqdm.tqdm(context, total=len(context), desc="进度"):
        resource_list.append(str(anchor.text))
        if len(resource_list) == num:
            file_name = "{}{}".format(str(name), ".txt")
            name += 1
            _write_resource_file(path, file_name, resource_list)
            resource_list = []
    # Write the remainder only when there is one — the original unconditionally
    # wrote a final file, producing an empty one whenever the total count was
    # an exact multiple of num.
    if resource_list:
        file_name = "{}{}".format(str(name), ".txt")
        _write_resource_file(path, file_name, resource_list)
# batch_download_pypi_resource()
# download_pypi_resource()
爬取程序
其中 download_pypi_resource() 直接爬取完整目录并写入单个文件;batch_download_pypi_resource() 则将目录拆分为每 3 万条一个文件,便于以后多开程序并行爬取。
def batchDownload(url_name, directory):
    """Download every file linked from one PyPI simple-index package page into *directory*.

    url_name:  package page URL ending in '/', e.g. https://.../simple/<pkg>/
    directory: existing local directory the files are saved into
    Names of failed downloads are appended to /data/logs.txt; the loop keeps going.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    # Actually send the browser User-Agent (the original built it but never used it).
    html = requests.get(url_name, headers=headers, timeout=20).text
    bsObj = BeautifulSoup(html, 'html.parser')
    # set() removes duplicate links.
    hset = {a.get('href') for a in bsObj.find_all('a')}
    for href in hset:
        link = url_name + href
        # File name = last path segment, without the #sha256=... fragment.
        name = href.split("/")[-1].split("#")[0]
        filename = os.path.join(directory, name)
        try:
            # BUG FIX: the original fetched url_name (the index page) again
            # instead of `link`, so every saved "package file" actually
            # contained the page's HTML.
            req = requests.get(link, headers=headers, timeout=20)
            with open(filename, "wb") as f:
                f.write(req.content)
        except Exception:
            # Best effort: log the failed name and move on to the next file.
            with open("/data/logs.txt", "a") as f:
                f.write(name + "\n")
        else:
            # Only report success when the download actually succeeded
            # (the original printed this even after an exception).
            print("成功下载", filename)
        # Throttle: without a pause the mirror treats the burst as an attack.
        time.sleep(1)
origin_url = 'https://pypi.tuna.tsinghua.edu.cn/simple/'
script_path = sys.argv[0]
origin_path = "./dir_name/2.txt"
# Load the work list: one package name per line, produced by
# batch_download_pypi_resource().  (The original also opened this file a
# second time without ever closing it — that leaked handle is removed.)
data_list = []
with open(origin_path, 'r') as f:
    for line in f:
        data_list.append(line.strip('\n'))
try:
    # Iterate a copy: data_list is mutated (names removed) as packages finish.
    for dir_name in list(data_list):
        url = "{}{}{}".format(origin_url, dir_name, '/')
        print(dir_name)
        # file_path = os.path.join('./pypi_data1', dir_name)
        file_path = './pypi_data1'
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        batchDownload(url, file_path)
        if dir_name in data_list:
            data_list.remove(dir_name)
except Exception as e:
    print("*" * 10)
    print(e)
    print("*" * 10)
finally:
    # Rewrite the work file with only the packages not yet downloaded, so a
    # restarted run (see the watchdog mentioned below) resumes where this
    # one stopped.
    with open(origin_path, 'w') as f:
        for line in data_list:
            f.write(line + "\n")
最后,结合我写的监听程序在爬虫异常退出后自动重启脚本,即可完成对 pypi 源的完整爬取。